From f70ec000baadecefda56c43744f51d86e2d8f634 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 16 Jan 2025 17:26:46 -0800 Subject: [PATCH 01/13] Add H100 to CI for fp8 --- .github/workflows/float8_test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 75482c9e24..6e0d0f9b5a 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -28,6 +28,11 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" + - name: H100 + runs-on: linux.aws.h100 + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + gpu-arch-type: "cuda" + gpu-arch-version: "12.1" uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: From fe250c23b7da526d77ebc7cfd01ddd1c358d3f07 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 16 Jan 2025 17:37:12 -0800 Subject: [PATCH 02/13] Test --- .github/workflows/float8_test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 6e0d0f9b5a..75482c9e24 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -28,11 +28,6 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" - - name: H100 - runs-on: linux.aws.h100 - torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' - gpu-arch-type: "cuda" - gpu-arch-version: "12.1" uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: From 07b266cbf610abc195343564579936c26349117a Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 16 Jan 2025 17:40:07 -0800 Subject: [PATCH 03/13] Test --- .github/workflows/float8_test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 75482c9e24..6e0d0f9b5a 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -28,6 +28,11 @@ jobs: torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" + - name: H100 + runs-on: linux.aws.h100 + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + gpu-arch-type: "cuda" + gpu-arch-version: "12.1" uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: From 1529aad91349f9ecd7417380d12cff82f558020e Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 16 Jan 2025 17:41:44 -0800 Subject: [PATCH 04/13] H100 in CI --- .github/workflows/regression_test.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 74b39d2ef2..6af533b176 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -87,6 +87,11 @@ jobs: torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu' gpu-arch-type: "cpu" gpu-arch-version: "" + - name: H100 + runs-on: linux.aws.h100 + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + gpu-arch-type: "cuda" + gpu-arch-version: "12.1" uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: From 3262b9bdd955a1c0d9cd91185b776d9f29c15b0d Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 16 Jan 2025 17:50:40 -0800 Subject: [PATCH 05/13] Fix CI linux_job permissions --- .github/workflows/float8_test.yml | 3 +++ .github/workflows/nightly_smoke_test.yml | 6 ++++-- .github/workflows/regression_test.yml | 3 +++ 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 75482c9e24..7c9e5a4b00 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -29,6 +29,9 @@ jobs: gpu-arch-type: "cuda" gpu-arch-version: "12.1" + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: timeout: 60 diff --git a/.github/workflows/nightly_smoke_test.yml b/.github/workflows/nightly_smoke_test.yml index d215f22ed2..18d4f41af6 100644 --- a/.github/workflows/nightly_smoke_test.yml +++ b/.github/workflows/nightly_smoke_test.yml @@ -11,7 +11,7 @@ concurrency: cancel-in-progress: true env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_TOKEN: ${{ secrets.HF_TOKEN }} jobs: test: @@ -25,7 +25,9 @@ jobs: gpu-arch-type: "cuda" gpu-arch-version: "12.1" - + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: runner: ${{ matrix.runs-on }} diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 74b39d2ef2..19c033c4d1 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -34,6 +34,9 @@ jobs: gpu-arch-type: "cpu" gpu-arch-version: "" + permissions: + id-token: write + contents: read uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main with: timeout: 120 From 08bccdc13c09dfe9db03aa131eb1e99b4677ceea Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 17 Jan 2025 14:09:14 -0800 Subject: [PATCH 06/13] Fix CI float8 test --- test/integration/test_integration.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/integration/test_integration.py b/test/integration/test_integration.py index 1087db8cf8..c926cee060 100644 --- a/test/integration/test_integration.py +++ b/test/integration/test_integration.py @@ -1821,7 +1821,7 @@ def test_autoquant_int4wo(self, device, dtype): self.assertGreater(compute_error(ref, out), 20) @parameterized.expand(COMMON_DEVICE_DTYPE) - @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + @unittest.skipIf(not is_sm_at_least_90(), "Need cuda arch greater than SM90") @unittest.skipIf( not TORCH_VERSION_AT_LEAST_2_5, "autoquant int4 option requires 2.5+." ) From a0f6769f4e8d03718b638a271a52e4b6fa39fb3c Mon Sep 17 00:00:00 2001 From: jainapurva Date: Fri, 17 Jan 2025 16:00:31 -0800 Subject: [PATCH 07/13] Checking H100 --- .github/workflows/float8_test.yml | 4 ++-- .github/workflows/regression_test.yml | 10 +++++----- test/float8/test_compile.py | 2 +- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index d36647bc48..84beafbea1 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -30,9 +30,9 @@ jobs: gpu-arch-version: "12.1" - name: H100 runs-on: linux.aws.h100 - torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124' gpu-arch-type: "cuda" - gpu-arch-version: "12.1" + gpu-arch-version: "12.4" permissions: id-token: write diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index 5358f52ceb..9940f9d2fc 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -74,6 +74,11 @@ jobs: torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" + - name: H100 + runs-on: linux.aws.h100 + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + gpu-arch-type: "cuda" + gpu-arch-version: "12.4" - name: CPU 2.3 runs-on: linux.4xlarge @@ -90,11 +95,6 @@ jobs: torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cpu' gpu-arch-type: "cpu" gpu-arch-version: "" - - name: H100 - runs-on: linux.aws.h100 - torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' - gpu-arch-type: "cuda" - gpu-arch-version: "12.1" uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: diff --git a/test/float8/test_compile.py b/test/float8/test_compile.py index c42ab8ee77..d7d4d75811 100644 --- a/test/float8/test_compile.py +++ b/test/float8/test_compile.py @@ -469,7 +469,7 @@ def test_dynamic_scale_numeric_parity(dtype: torch.dtype): @unittest.skipIf( - not is_sm_at_least_89() or not is_fbcode(), + not is_sm_at_least_89() or is_fbcode(), "CUDA with float8 support not available; or not on fbcode (the test needs be run with the latest pytorch package)", ) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) From 529be58ac94e2d852f5c2162292e24c13bb5de3e Mon Sep 17 00:00:00 2001 From: jainapurva Date: Mon, 20 Jan 2025 23:59:52 -0800 Subject: [PATCH 08/13] Remove compile updates --- test/float8/test_compile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/float8/test_compile.py b/test/float8/test_compile.py index d7d4d75811..c42ab8ee77 100644 --- a/test/float8/test_compile.py +++ b/test/float8/test_compile.py @@ -469,7 +469,7 @@ def test_dynamic_scale_numeric_parity(dtype: torch.dtype): @unittest.skipIf( - not is_sm_at_least_89() or is_fbcode(), + not is_sm_at_least_89() or not is_fbcode(), "CUDA with float8 support not available; or not on fbcode (the test needs be run with the latest pytorch package)", ) @pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16, torch.float32]) From 644cd10f70458e8f87629c0f3d9cbf52e384d0b9 Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 23 Jan 2025 14:45:44 -0800 Subject: [PATCH 09/13] Revert regression_test.yml --- .github/workflows/regression_test.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/regression_test.yml b/.github/workflows/regression_test.yml index e5f2d540f2..14c31014c3 100644 --- a/.github/workflows/regression_test.yml +++ b/.github/workflows/regression_test.yml @@ -74,11 +74,6 @@ jobs: torch-spec: 'torch==2.5.1 --index-url https://download.pytorch.org/whl/cu121' gpu-arch-type: "cuda" gpu-arch-version: "12.1" - - name: H100 - runs-on: linux.aws.h100 - torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' - gpu-arch-type: "cuda" - gpu-arch-version: "12.4" - name: CPU 2.3 runs-on: linux.4xlarge From cc06016cae712767032eaa19035d040aaed9dfeb Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 23 Jan 2025 15:58:05 -0800 Subject: [PATCH 10/13] Fixed nightly version --- .github/workflows/float8_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 52a1b478e8..42a5861624 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -30,7 +30,7 @@ jobs: gpu-arch-version: "12.4" - name: H100 runs-on: linux.aws.h100 - torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu124' + torch-spec: '--pre torch==2.6.0 --index-url https://download.pytorch.org/whl/nightly/cu124' gpu-arch-type: "cuda" gpu-arch-version: "12.4" From 2c2a02b6552a679d0cab14e5d5faeb939b0357a3 Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Thu, 23 Jan 2025 15:59:23 -0800 Subject: [PATCH 11/13] [BE] Only run docs build in CI if docs have changed (#1589) only run docs build in CI if docs have changed --- .github/workflows/doc_build.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/doc_build.yml b/.github/workflows/doc_build.yml index 19c1204e6d..d16ed0340b 100644 --- a/.github/workflows/doc_build.yml +++ b/.github/workflows/doc_build.yml @@ -9,6 +9,9 @@ on: tags: - v[0-9]+.[0-9]+.[0-9] - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ + paths: + - 'docs/**' + - '!docs/**' pull_request: workflow_dispatch: From 2882f1ee950b60633da84eea2c814b61f70116c8 Mon Sep 17 00:00:00 2001 From: Daniel Vega-Myhre Date: Thu, 23 Jan 2025 16:00:48 -0800 Subject: [PATCH 12/13] [float8nocompile] Add float8nocompile CI tests which only trigger on relevant code changes (#1570) add float8nocompile CI tests --- .github/workflows/float8nocompile_test.yaml | 55 +++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 .github/workflows/float8nocompile_test.yaml diff --git a/.github/workflows/float8nocompile_test.yaml b/.github/workflows/float8nocompile_test.yaml new file mode 100644 index 0000000000..75df32a5d4 --- /dev/null +++ b/.github/workflows/float8nocompile_test.yaml @@ -0,0 +1,55 @@ +name: Run Float8nocompile Tests + +on: + push: + branches: + - main + - 'gh/**' + paths: + - 'torchao/prototype/float8nocompile/**' + - '!torchao/prototype/float8nocompile/**' + pull_request: + branches: + - main + - 'gh/**' + paths: + - 'torchao/prototype/float8nocompile/**' + - '!torchao/prototype/float8nocompile/**' + +concurrency: + group: floatnocompile_test-${{ github.workflow }}-${{ github.ref == 'refs/heads/main' && github.run_number || github.ref }} + cancel-in-progress: true + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + +jobs: + test: + strategy: + fail-fast: false + matrix: + include: + - name: SM-89 + runs-on: linux.g6.4xlarge.experimental.nvidia.gpu + torch-spec: '--pre torch --index-url https://download.pytorch.org/whl/nightly/cu121' + gpu-arch-type: "cuda" + gpu-arch-version: "12.1" + + uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main + with: + timeout: 300 + runner: ${{ matrix.runs-on }} + gpu-arch-type: ${{ matrix.gpu-arch-type }} + gpu-arch-version: ${{ matrix.gpu-arch-version }} + submodules: recursive + script: | + conda create -n venv python=3.9 -y + conda activate venv + export PATH=/opt/rh/devtoolset-10/root/usr/bin/:$PATH + python -m pip install --upgrade pip + pip install ${{ matrix.torch-spec }} + pip install -r dev-requirements.txt + pip install . + cd torchao/prototype/float8nocompile + pytest kernels/ --verbose -s + pytest test/train_test.py --verbose -s From 0118d2c876d28508ea270b0530aab8242886404e Mon Sep 17 00:00:00 2001 From: jainapurva Date: Thu, 23 Jan 2025 17:43:21 -0800 Subject: [PATCH 13/13] Torch fixes --- .github/workflows/float8_test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/float8_test.yml b/.github/workflows/float8_test.yml index 42a5861624..3cf2d13933 100644 --- a/.github/workflows/float8_test.yml +++ b/.github/workflows/float8_test.yml @@ -30,7 +30,7 @@ jobs: gpu-arch-version: "12.4" - name: H100 runs-on: linux.aws.h100 - torch-spec: '--pre torch==2.6.0 --index-url https://download.pytorch.org/whl/nightly/cu124' + torch-spec: '--pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu124' gpu-arch-type: "cuda" gpu-arch-version: "12.4"