
Merge mscclpp-lang to mscclpp project #442

Merged · 37 commits · Jan 22, 2025
Changes from 23 commits

Commits
9376fb3  try to add mscclpp-lang  (Binyang2014, Dec 31, 2024)
671008c  WIP  (Binyang2014, Jan 1, 2025)
46f8dcd  Fix  (Binyang2014, Jan 1, 2025)
740e008  WIP  (Binyang2014, Jan 1, 2025)
35eeed9  WIP  (Binyang2014, Jan 1, 2025)
84e1f41  WIP  (Binyang2014, Jan 1, 2025)
8378af6  Merge branch 'main' into binyli/mscclpp-lang  (Binyang2014, Jan 3, 2025)
27e82b9  WIP  (Binyang2014, Jan 4, 2025)
a0eed7b  add examples  (Binyang2014, Jan 6, 2025)
b8a6e41  WIP  (Binyang2014, Jan 7, 2025)
5ee7cc6  update  (Binyang2014, Jan 7, 2025)
b6ad968  WIP  (Binyang2014, Jan 7, 2025)
8800ff1  fix lint  (Binyang2014, Jan 7, 2025)
e2c4df5  update  (Binyang2014, Jan 7, 2025)
d4bfa03  WIP  (Binyang2014, Jan 7, 2025)
5ed3210  fix  (Binyang2014, Jan 7, 2025)
05a5925  fix  (Binyang2014, Jan 7, 2025)
320d258  update  (Binyang2014, Jan 7, 2025)
254aad6  WIP  (Binyang2014, Jan 7, 2025)
4cfa765  WIP  (Binyang2014, Jan 7, 2025)
d1e9872  Merge branch 'main' into binyli/mscclpp-lang  (Binyang2014, Jan 7, 2025)
a249c04  update  (Binyang2014, Jan 8, 2025)
c92392f  update  (Binyang2014, Jan 8, 2025)
d9d8152  fix comment  (Binyang2014, Jan 15, 2025)
92eeab9  Merge branch 'main' into binyli/mscclpp-lang  (Binyang2014, Jan 15, 2025)
98ba12a  add broadcast  (Binyang2014, Jan 16, 2025)
a799959  Merge branch 'main' into binyli/mscclpp-lang  (Binyang2014, Jan 16, 2025)
607d6c3  add doc  (Binyang2014, Jan 16, 2025)
a9bd223  Merge branch 'binyli/mscclpp-lang' of https://github.com/microsoft/ms…  (Binyang2014, Jan 16, 2025)
0f3f433  update doc link  (Binyang2014, Jan 16, 2025)
7a3427e  address comments  (Binyang2014, Jan 17, 2025)
e179646  add comments  (Binyang2014, Jan 17, 2025)
11f3723  update  (Binyang2014, Jan 17, 2025)
2cfc915  address comments  (Binyang2014, Jan 17, 2025)
7afadee  address comments  (Binyang2014, Jan 21, 2025)
613bdd1  fix  (Binyang2014, Jan 21, 2025)
4585515  update ci  (Binyang2014, Jan 21, 2025)
Files changed
19 changes: 1 addition & 18 deletions .azure-pipelines/nccl-api-test.yaml
@@ -25,7 +25,7 @@ jobs:

steps:
- checkout: self
- - checkout: git://One/msccl-users
+ - checkout: git://One/msccl-users@binyli/merge-script
- task: Bash@3
name: Build
displayName: Build
@@ -87,23 +87,6 @@ jobs:
parallel-scp -t 0 -r -h ${HOSTFILE} -x "-i ${KeyFilePath}" -O $SSH_OPTION ${ROOT_DIR} ${DST_DIR}
workingDirectory: '$(System.DefaultWorkingDirectory)'

- - task: Bash@3
-   name: InstallMscclTools
-   displayName: Install msccl-tools
-   inputs:
-     targetType: 'inline'
-     script: |
-       set -e
-       HOSTFILE=$(System.DefaultWorkingDirectory)/mscclpp/test/deploy/hostfile_ci
-       SSH_OPTION="StrictHostKeyChecking=no"
-       KeyFilePath=${SSHKEYFILE_SECUREFILEPATH}
-       parallel-ssh -i -t 0 -h ${HOSTFILE} -x "-i ${KeyFilePath}" \
-         -O $SSH_OPTION 'sudo docker exec -t mscclpp-test bash -c " \
-         cd /root/mscclpp; \
-         git clone https://github.com/Azure/msccl-tools.git; \
-         cd /root/mscclpp/msccl-tools; pip3 install ."'
-   workingDirectory: '$(System.DefaultWorkingDirectory)'

- task: Bash@3
name: GenerateExecutionFile
displayName: Generate execution file
46 changes: 46 additions & 0 deletions .github/workflows/mscclpp-lang.yml
@@ -0,0 +1,46 @@
name: MSCCLPPLang

on:
pull_request:
branches:
- main
- release/*

jobs:
compare-diffs:
runs-on: 'ubuntu-latest'
container:
image: ghcr.io/microsoft/mscclpp/mscclpp:base-dev-${{ matrix.version }}

strategy:
fail-fast: false
matrix:
version: [ 'cuda11.8', 'cuda12.2' ]

steps:
- uses: actions/checkout@v4
- name: Install mscclpp
run: |
CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install .

- name: Copy test script/config to temp directory
run: |
cp python/test/generate_mscclpp_lang_test_result.py $RUNNER_TEMP/
cp python/test/configs/mscclpp_lang_test_config.json $RUNNER_TEMP/
- name: Generate outputs (PR branch)
run: |
python3 $RUNNER_TEMP/generate_mscclpp_lang_test_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/pr-outputs/
- name: Checkout main branch
uses: actions/checkout@v4
if: github.event_name == 'pull_request' || github.event_name == 'push'
with:
ref: main
- name: Install mscclpp and dependencies
run: |
CMAKE_ARGS="-DMSCCLPP_BYPASS_GPU_CHECK=ON -DMSCCLPP_USE_CUDA=ON" pip3 install .
- name: Generate outputs (main branch)
run: |
python3 $RUNNER_TEMP/generate_mscclpp_lang_test_result.py python/examples/ $RUNNER_TEMP/mscclpp_lang_test_config.json $RUNNER_TEMP/tests/main-outputs/
- name: Compare outputs
run: |
diff -rw $RUNNER_TEMP/tests/main-outputs/ $RUNNER_TEMP/tests/pr-outputs/
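For reference, the compare-diffs job above regenerates the JSON execution plans from python/examples/ on both the PR branch and main, then fails if the two output trees differ (via diff -rw). A minimal Python sketch of that final check, assuming both directories contain only the generated JSON files (the paths and helper name are illustrative, not part of the workflow):

import filecmp
import sys
from pathlib import Path

def outputs_match(main_dir: str, pr_dir: str) -> bool:
    # Collect the generated plans from both trees, relative to their roots.
    main_files = sorted(p.relative_to(main_dir) for p in Path(main_dir).rglob("*.json"))
    pr_files = sorted(p.relative_to(pr_dir) for p in Path(pr_dir).rglob("*.json"))
    if main_files != pr_files:
        return False  # an example's output appeared or disappeared
    # Byte-compare each pair (the CI job uses diff -rw, which also ignores whitespace).
    return all(filecmp.cmp(Path(main_dir) / f, Path(pr_dir) / f, shallow=False) for f in main_files)

if __name__ == "__main__":
    sys.exit(0 if outputs_match(sys.argv[1], sys.argv[2]) else 1)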
47 changes: 47 additions & 0 deletions python/examples/allgather_barrier.py
@@ -0,0 +1,47 @@
import argparse
from mscclpp.language import *
from mscclpp.language.buffer import Buffer
from mscclpp.language.collectives import AllGather
from mscclpp.language.types import ChannelType, ReplicationPolicy


def allgather_test(gpus, instances):
size = gpus
collective = AllGather(size, 1, False)
with MSCCLPPProgram(
"allgather_with_barrier",
collective,
size,
instances,
protocol="Simple",
replication_policy=ReplicationPolicy.interleaved,
):
for n in range(gpus):
c = chunk(n, Buffer.input, 0, 1)
for peer in range(gpus):
if n != peer:
c.put(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm)
else:
c.copy(n, Buffer.output, n, sendtb=peer)
# explicit barrier
r = rank(n)
r.barrier(tb_list=list(range(gpus)))
for peer in range(gpus):
if n != peer:
c.signal(peer, Buffer.output, n, sendtb=peer, chan_type=ChannelType.sm)

for n in range(gpus):
for peer in range(gpus):
c = chunk(n, Buffer.output, peer, 1)
if n != peer:
c.wait(peer, Buffer.input, peer, recvtb=peer, chan_type=ChannelType.sm)

Json()
Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of gpus")
parser.add_argument("instances", type=int, help="number of instances")
args = parser.parse_args()
allgather_test(args.num_gpus, args.instances)
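A pure-Python model of the data movement in this example may help: after the put/copy phase, every rank's output buffer holds rank n's chunk at index n; the barrier plus signal/wait only orders those writes. This is an illustration, not the MSCCL++ API:

def allgather_reference(gpus):
    inputs = [f"chunk{n}" for n in range(gpus)]
    outputs = [[None] * gpus for _ in range(gpus)]
    for n in range(gpus):          # sending rank
        for peer in range(gpus):   # put to each peer, local copy for itself
            outputs[peer][n] = inputs[n]
    return outputs

assert allgather_reference(2) == [["chunk0", "chunk1"], ["chunk0", "chunk1"]]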
57 changes: 57 additions & 0 deletions python/examples/allreduce_allpairs.py
@@ -0,0 +1,57 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer


def allreduce_allpairs(gpus, instances, protocol):
size = gpus
chunksperloop = gpus * gpus
collective = AllReduce(size, chunksperloop, True)
with MSCCLPPProgram("allreduce_pairs", collective, size, instances, protocol=protocol):
for rank in range(size):
for tb in range(size):
index = rank * size
c = chunk(rank, Buffer.input, index + tb)
# step 1: make sure the data is ready
for nghr in range(size):
peer_index = nghr * size
if rank != nghr:
# signal the peer that the buffer is ready
c_peer = chunk(rank, Buffer.input, peer_index + tb)
c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb)
for nghr in range(size):
if rank != nghr:
c.wait(nghr, Buffer.input, index + tb, recvtb=tb)
# step 2: reduce the chunks and send them to the peers
for nghr in range(size):
if rank != nghr:
c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb)
for nghr in range(size):
if rank != nghr:
c.put(nghr, Buffer.input, index + tb, sendtb=tb)
# step 3: signal the peers that the buffer is ready
for nghr in range(size):
if rank != nghr:
c.signal(nghr, Buffer.input, index + tb, sendtb=tb)
for nghr in range(size):
if rank != nghr:
peer_index = nghr * size
c_peer = chunk(rank, Buffer.input, peer_index + tb)
c_peer.wait(nghr, Buffer.input, peer_index + tb, recvtb=tb)

Json()
Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of gpus")
parser.add_argument("instances", type=int, help="number of instances")
parser.add_argument("--protocol", type=str, default="Simple", choices=["Simple"], help="Protocol")

args = parser.parse_args()

allreduce_allpairs(args.num_gpus, args.instances, args.protocol)
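The indexing above ("index = rank * size", chunk at index + tb) assigns each (rank, tb) pair one of the gpus * gpus input chunks. A quick illustration for gpus = 2:

size = 2
for rank in range(size):
    for tb in range(size):
        print(f"rank {rank}, tb {tb} -> input chunk {rank * size + tb}")
# rank 0, tb 0 -> input chunk 0
# rank 0, tb 1 -> input chunk 1
# rank 1, tb 0 -> input chunk 2
# rank 1, tb 1 -> input chunk 3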
70 changes: 70 additions & 0 deletions python/examples/allreduce_allpairs_get.py
@@ -0,0 +1,70 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer


def allreduce_allpairs(gpus, instances):
size = gpus
chunksperloop = gpus * gpus
collective = AllReduce(size, chunksperloop, True)
with MSCCLPPProgram(
"allreduce_pairs",
collective,
size,
instances,
protocol="Simple",
):

# Each rank reduces its own chunks in place with the peers' copies, then signals that the results are ready
for rank in range(size):
for tb in range(size):
index = rank * size
c = chunk(rank, Buffer.input, index + tb)
# make sure the data is ready
for nghr in range(size):
peer_index = nghr * size
if rank != nghr:
c_peer = chunk(rank, Buffer.input, peer_index + tb)
c_peer.signal(nghr, Buffer.input, peer_index + tb, sendtb=tb)
for nghr in range(size):
if rank != nghr:
c.wait(nghr, Buffer.input, index + tb, recvtb=tb)
# reduce the chunks
for i in range(size):
nghr = (rank + i) % size
if rank != nghr:
c.reduce(chunk(nghr, Buffer.input, index + tb), recvtb=tb)
for nghr in range(size):
if rank != nghr:
c.signal(nghr, Buffer.input, index + tb, sendtb=tb)

# wait until all the chunks are ready, then get them
for rank in range(size):
for tb in range(size):
for nghr in range(size):
if rank != nghr:
index = nghr * size
c = chunk(rank, Buffer.input, index + tb)
c.wait(nghr, Buffer.input, index + tb, recvtb=tb)
for i in range(size):
nghr = (rank + i) % size
index = nghr * size
if rank != nghr:
c = chunk(rank, Buffer.input, index + tb)
c.get(nghr, Buffer.input, index + tb, recvtb=tb)

Json()
Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of gpus")
parser.add_argument("instances", type=int, help="number of instances")

args = parser.parse_args()

allreduce_allpairs(args.num_gpus, args.instances)
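Unlike allreduce_allpairs.py, which pushes reduced chunks with put, this variant pulls them with get after the owner signals. The reduce and get loops also visit peers in a rotated order, nghr = (rank + i) % size, so that at each step no two ranks read from the same peer. A look at that schedule for size = 4 (step 0 is the rank itself and is skipped):

size = 4
for i in range(1, size):
    print(f"step {i}:", [(rank, (rank + i) % size) for rank in range(size)])
# step 1: [(0, 1), (1, 2), (2, 3), (3, 0)]
# step 2: [(0, 2), (1, 3), (2, 0), (3, 1)]
# step 3: [(0, 3), (1, 0), (2, 1), (3, 2)]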
61 changes: 61 additions & 0 deletions python/examples/allreduce_allpairs_packet.py
@@ -0,0 +1,61 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer


def allreduce_allpairs(gpus, instances):
size = gpus
chunksperloop = gpus * gpus
collective = AllReduce(size, chunksperloop, True)
with MSCCLPPProgram(
"allreduce_packets",
collective,
size,
instances,
protocol="LL",
use_double_scratch_buffer=True,
):
# Each rank sends the nth chunk to the nth rank into scratch space
for r1 in range(size):
for tb in range(size):
if tb == r1:
continue
remote_rank = tb
index = remote_rank * size
c = chunk(r1, Buffer.input, index, size)
c.put_packet(remote_rank, "scratch", index=r1 * size, sendtb=tb)

# Each rank performs a local reduction on the nth chunk
# Utilize one threadblock per chunk (size threadblocks in total) for better parallelism
for r in range(size):
for index in range(size):
c = chunk(r, Buffer.input, r * size + index)
for peer in range(size):
if peer != r:
c.reduce_packet(chunk(r, "scratch", peer * size + index), recvtb=index)
for peer in range(size):
if peer != r:
c.put_packet(peer, "scratch", (size * size) + r * size + index, sendtb=index)

# Each rank gets the final result from scratch space
for r in range(size):
for peer in range(size):
if peer != r:
c = chunk(r, "scratch", size * size + peer * size, size)
c.copy_packet(r, Buffer.input, peer * size, sendtb=peer)

Json()
Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of gpus")
parser.add_argument("instances", type=int, help="number of instances")

args = parser.parse_args()

allreduce_allpairs(args.num_gpus, args.instances)
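The offsets in the put_packet calls imply a two-region scratch layout: the first size * size slots receive raw chunks keyed by the sending rank, and the next size * size slots receive the reduced results broadcast in the second phase. A small sketch of that arithmetic (function and variable names are illustrative only):

def scratch_slots(size, sender, index):
    raw_slot = sender * size + index                    # phase 1: raw chunk from `sender`
    reduced_slot = size * size + sender * size + index  # phase 2: reduced chunk produced by `sender`
    return raw_slot, reduced_slot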
48 changes: 48 additions & 0 deletions python/examples/allreduce_nvls.py
@@ -0,0 +1,48 @@
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.

import argparse
from mscclpp.language import *
from mscclpp.language.collectives import AllReduce
from mscclpp.language.buffer import Buffer


def allreduce_allpairs(gpus, instances):
size = gpus
chunksperloop = gpus
collective = AllReduce(size, chunksperloop, True)
with MSCCLPPProgram(
"allreduce_nvls",
collective,
size,
instances,
):
# Each rank reduces its chunk with the peers' copies via NVLS, then stores the result back to the peers
for rank in range(size):
index = rank
c = chunk(rank, Buffer.input, index)
reduce_chunks = []
# make sure the data is ready
for nghr in range(size):
if rank != nghr:
c_peer = chunk(nghr, Buffer.input, index)
reduce_chunks.append(c_peer)
c.signal(nghr, Buffer.input, index, sendtb=0)
for nghr in range(size):
if rank != nghr:
c.wait(nghr, Buffer.input, index, recvtb=0)
c = c.group_load_reduce(reduce_chunks, recvtb=0)
ngbrs = [nghr for nghr in range(size) if nghr != rank]
c.group_store(ngbrs, sendtb=0)

Json()
Check()


parser = argparse.ArgumentParser()
parser.add_argument("num_gpus", type=int, help="number of gpus")
parser.add_argument("instances", type=int, help="number of instances")

args = parser.parse_args()

allreduce_allpairs(args.num_gpus, args.instances)
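Toy semantics for the two NVLS primitives used above, as read from this example rather than from the library's definition: group_load_reduce sums the same chunk index across all participating ranks, and group_store writes the reduced value back to the listed neighbors.

def group_load_reduce(values):       # one value per rank, same chunk index
    return sum(values)

def group_store(result, ngbrs, buffers, index):
    for peer in ngbrs:               # broadcast the reduced chunk to the peers
        buffers[peer][index] = result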