reenable gfx1100 ci #121

Draft · wants to merge 4 commits into base: main_perf

Changes from all commits
32 changes: 18 additions & 14 deletions .github/workflows/amd_tests.yml
@@ -16,71 +16,75 @@ jobs:
     runs-on: ${{ matrix.runner }}
     strategy:
       matrix:
-        runner: [linux-mi300-gpu-1]
+        runner: [linux-mi300-gpu-1, gfx1100]
       fail-fast: false # disables failing the entire job when one matrix entry fails
     container:
       image: rocm/pytorch:rocm6.2.3_ubuntu22.04_py3.10_pytorch_release_2.3.0
       options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
     steps:
       - name: Checkout
         uses: actions/checkout@v4

       - name: Show Device Info
         run: |
           rocminfo | grep gfx

       - name: Uninstall Triton
         run: |
           pip uninstall -y triton
           rm -rf ~/.triton
           rm -rf ./triton/python/build

       - name: Install Triton
         run: |
           git clone https://github.com/triton-lang/triton
           cd triton
           git checkout 3ca2f498e98ed7249b82722587c511a5610e00c4
           pip install ninja cmake wheel pybind11 # build-time dependencies
-          pip install matplotlib pandas pytest # triton bench dependencies
+          pip install matplotlib pandas pytest pytest-randomly # triton bench dependencies
           pip install --verbose --no-build-isolation ./python
           cd ..

       - name: Show Triton version
         run: |
           pip show triton

       - name: Build
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
           python setup.py install

       # CDNA Tests
-      - name: Flash Attention Tests Using Reference Impl
+      - name: Flash Attention Tests using Pytorch reference implementation
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
-          export FLASH_ATTENTION_TRITON_AMD_REF=1
-          pytest tests/test_flash_attn_triton_amd.py
+          FLASH_ATTENTION_TRITON_AMD_REF=1 pytest tests/test_flash_attn_triton_amd.py

       # CDNA Tests
       - name: Flash Attention CDNA Tests
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
           pytest tests/test_flash_attn_triton_amd.py

       # FIXME: run the full suite
       - name: AMD Tests
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
           pytest -v -s flash_attn/flash_attn_triton_amd/test.py::test_op_prefill_fp8 flash_attn/flash_attn_triton_amd/test.py::test_op_prefill_varlen_fp8

       - name: AMD Bench
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
           python flash_attn/flash_attn_triton_amd/bench.py

       - name: AMD Bench with Autotune
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
-          export FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=1
-          python flash_attn/flash_attn_triton_amd/bench.py
+          FLASH_ATTENTION_TRITON_AMD_AUTOTUNE=1 python flash_attn/flash_attn_triton_amd/bench.py

       # RDNA Tests
       - name: Flash Attention RDNA Tests
         if: matrix.runner == 'gfx1100'
         run: |
           export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
-          pytest tests/test_flash_attn_triton_amd.py::test_flash_attn_output tests/test_flash_attn_triton_amd.py::test_flash_attn_varlen_output tests/test_flash_attn_triton_amd.py::test_flash_attn_kvcache
+          # NOTE: this exceeds 6 hrs on "gfx1100" so sample a subset of the tests. The full suite is run on a CDNA machine.
+          pytest --randomly-seed=42 --randomly-sample=0.10 tests/test_flash_attn_triton_amd.py
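The gfx1100 step keeps its runtime under the limit by running only a seeded random fraction of the suite. The `--randomly-sample` flag is not part of upstream pytest-randomly, so the sketch below is an assumption about its behavior rather than a description of the plugin's implementation: with a fixed seed, the same ~10% of collected tests is selected on every run, making failures reproducible.

```python
import random


def sample_tests(collected, fraction, seed):
    """Deterministically pick ~fraction of the collected test ids.

    Assumed behavior of `pytest --randomly-seed=42 --randomly-sample=0.10`:
    a fixed seed means every CI run executes the same subset, even though
    only a tenth of the suite actually runs.
    """
    rng = random.Random(seed)  # seeded RNG -> same selection on every run
    k = max(1, round(len(collected) * fraction))
    return sorted(rng.sample(collected, k))


# Hypothetical test ids standing in for the real flash-attention suite.
tests = [f"test_flash_attn_output[case{i}]" for i in range(100)]
subset = sample_tests(tests, 0.10, seed=42)

print(len(subset))                                   # 10
assert subset == sample_tests(tests, 0.10, seed=42)  # reproducible subset
assert set(subset) <= set(tests)                     # drawn from the suite
```

The fixed seed is the important part: an unseeded sample would exercise different tests on every push, so a red run could go green on retry without anything being fixed.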
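Several hunks also replace an `export VAR=1` followed by a command with the one-shot form `VAR=1 pytest ...`. The difference matters in a multi-line `run:` block: the one-shot form scopes the variable to that single command, so it cannot leak into anything executed afterwards in the same shell. A minimal sketch of the distinction (the variable name is taken from the workflow; the child command is illustrative):

```python
import os
import subprocess
import sys

# One-shot form: the variable exists only in the child's environment,
# mirroring `FLASH_ATTENTION_TRITON_AMD_REF=1 pytest ...` in the workflow.
child_env = {**os.environ, "FLASH_ATTENTION_TRITON_AMD_REF": "1"}
result = subprocess.run(
    [sys.executable, "-c",
     "import os; print(os.environ['FLASH_ATTENTION_TRITON_AMD_REF'])"],
    env=child_env, capture_output=True, text=True, check=True,
)
print(result.stdout.strip())  # 1

# The parent process is unaffected -- nothing was exported into it.
print("FLASH_ATTENTION_TRITON_AMD_REF" in os.environ)
```

With the old `export` form, a later command in the same step would still see `FLASH_ATTENTION_TRITON_AMD_REF=1`; the one-shot form removes that ordering hazard.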