Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

vllm using mooncake as a pipe over RDMA does not work, error message: transport retry counter exceeded #82

Open
yueyuep opened this issue Jan 16, 2025 · 2 comments

Comments

@yueyuep
Copy link

yueyuep commented Jan 16, 2025

When I use mooncake in vllm with RDMA across nodes, I get an error; however, it works when both endpoints run on the same machine.

nodeA:
mooncake.json
1 {
2 "prefill_url": "10.215.192.154:13003",
3 "decode_url": "10.215.192.154:13008",
4 "metadata_server": "http://10.227.73.25:8999",
5 "metadata_backend": "http",
6 "protocol": "rdma",
7 "device_name": "mlx5_1"
8 }

nodeB:
mooncake.json
1 {
2 "prefill_url": "10.215.192.154:13003",
3 "decode_url": "10.215.192.155:13008",
4 "metadata_server": "http://10.227.73.25:8999",
5 "metadata_backend": "http",
6 "protocol": "rdma",
7 "device_name": "mlx5_1"
8 }

test_mooncake_send_recv.py :

import os
import time
from typing import List

import torch
from tqdm import tqdm

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.mooncake_pipe import MooncakePipe
def test_run(my_rank, pipe):
    print(f"rank {my_rank} test_run starts....")
    # test run
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
    if my_rank == 0:
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print(f"rank {my_rank} received y2 = ", y2)

    else:
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print(f"rank {my_rank} received y2 = ", y2)
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")

    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)

    print(f"rank {my_rank} test_run passed!")

if __name__ == "__main__":
    # Rank is supplied by the launcher's environment: 0 acts as the KV
    # producer (prefill side), any other rank as the KV consumer.
    my_rank = int(os.environ['RANK'])

    role = "kv_producer" if my_rank == 0 else "kv_consumer"
    config = KVTransferConfig(
        kv_connector='MooncakeConnector',
        kv_buffer_size=1e9,
        kv_buffer_device='cuda:0',
        kv_rank=my_rank,
        kv_role=role,
        kv_parallel_size=2,
    )

    # NOTE(review): MooncakePipe presumably reads the mooncake.json
    # transfer-engine config at construction time — confirm.
    pipe = MooncakePipe(local_rank=0, config=config)

    test_run(my_rank, pipe)

error summary:
E0114 23:53:14.513487 509 worker_pool.cpp:281] Worker: Process failed for slice (opcode: 0, source_addr: 0x7fa6e3ffd010, length: 391, dest_addr: 140468583125008, local_nic: mlx5_2, peer_nic: 10.215.192.154:13003@mlx5_2, dest_rkey: 12940, retry_cnt: 6): transport retry counter exceeded
I0114 23:53:14.520144 509 transfer_metadata_plugin.cpp:167] Get segment desc, key=mooncake/ram/10.215.192.154:13003, value={"buffers":[{"addr":140466435649552,"length":2147483648,"lkey":[12940],"name":"cpu:0","rkey":[12940]}],"devices":[{"gid":"00:00:00:00:00:00:00:00:00:00:ff:ff:0a:24:21:9a","lid":0,"name":"mlx5_2"}],"name":"10.215.192.154:13003","priority_matrix":{"cpu:0":[["mlx5_2"],[]]},"protocol":"rdma"}
I0114 23:53:14.520956 509 transfer_metadata_plugin.cpp:539] SocketHandShakePlugin: connecting 10.215.192.154:13003
I0114 23:53:14.522734 509 transfer_metadata.cpp:61] TransferHandshakeUtil::decode: local_nic_path 10.215.192.154:13003@mlx5_2 peer_nic_path 10.215.192.155:13008@mlx5_2 qp_num count 2
E0114 23:53:18.808466 509 worker_pool.cpp:281] Worker: Process failed for slice (opcode: 0, source_addr: 0x7fa6e3ffd010, length: 391, dest_addr: 140468583125008, local_nic: mlx5_2, peer_nic: 10.215.192.154:13003@mlx5_2, dest_rkey: 12940, retry_cnt: 7): transport retry counter exceeded
ERROR 01-14 23:53:18 mooncake_pipe.py:161] Transfer Return Error
Traceback (most recent call last):
File "/sgl-workspace/test_mooncake_trans/test_mooncake_send_recv.py", line 137, in
test_run(my_rank, pipe)
File "/sgl-workspace/test_mooncake_trans/test_mooncake_send_recv.py", line 28, in test_run
x2 = pipe.recv_tensor()
File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 259, in recv_tensor
tensor = self.transport_thread.submit(self._recv_impl).result()
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result
return self.__get_result()
File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result
raise self._exception
File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 244, in _recv_impl
data = self.transfer_engine.recv_bytes()
File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 194, in recv_bytes
self.transfer_sync(dst_ptr, src_ptr, length)
File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 162, in transfer_sync
raise Exception("Transfer Return Error")
Exception: Transfer Return Error

@alogfans
Copy link
Collaborator

Check how you filled in mooncake.json. prefill_url should be the IP address of node A, and decode_url should be the IP address of node B. Both machines must share the same file content.

@yueyuep
Copy link
Author

yueyuep commented Jan 21, 2025

When I use mooncake in vllm with RDMA across nodes, I get an error; however, it works when both endpoints run on the same machine.

nodeA: mooncake.json 1 { 2 "prefill_url": "10.215.192.154:13003", 3 "decode_url": "10.215.192.154:13008", 4 "metadata_server": "http://10.227.73.25:8999", 5 "metadata_backend": "http", 6 "protocol": "rdma", 7 "device_name": "mlx5_1" 8 }

nodeB: mooncake.json 1 { 2 "prefill_url": "10.215.192.154:13003", 3 "decode_url": "10.215.192.155:13008", 4 "metadata_server": "http://10.227.73.25:8999", 5 "metadata_backend": "http", 6 "protocol": "rdma", 7 "device_name": "mlx5_1" 8 }

test_mooncake_send_recv.py :

import os
import time
from typing import List

import torch
from tqdm import tqdm

from vllm.config import KVTransferConfig
from vllm.distributed.kv_transfer.kv_pipe.mooncake_pipe import MooncakePipe
def test_run(my_rank, pipe):
    print(f"rank {my_rank} test_run starts....")
    # test run
    x = torch.tensor([1]).to(pipe.device)
    y = torch.tensor([[2., 3., 4., 8.]]).to(pipe.device)
    if my_rank == 0:
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print(f"rank {my_rank} received y2 = ", y2)

    else:
        x2 = pipe.recv_tensor()
        print(f"rank {my_rank} received x2 = ", x2)
        y2 = pipe.recv_tensor()
        print(f"rank {my_rank} received y2 = ", y2)
        pipe.send_tensor(x)
        print(f"rank {my_rank} sent tensor x")
        pipe.send_tensor(y)
        print(f"rank {my_rank} sent tensor y")

    assert torch.allclose(x, x2)
    assert torch.allclose(y, y2)

    print(f"rank {my_rank} test_run passed!")

if __name__ == "__main__":
    # Rank is supplied by the launcher's environment: 0 acts as the KV
    # producer (prefill side), any other rank as the KV consumer.
    my_rank = int(os.environ['RANK'])

    role = "kv_producer" if my_rank == 0 else "kv_consumer"
    config = KVTransferConfig(
        kv_connector='MooncakeConnector',
        kv_buffer_size=1e9,
        kv_buffer_device='cuda:0',
        kv_rank=my_rank,
        kv_role=role,
        kv_parallel_size=2,
    )

    # NOTE(review): MooncakePipe presumably reads the mooncake.json
    # transfer-engine config at construction time — confirm.
    pipe = MooncakePipe(local_rank=0, config=config)

    test_run(my_rank, pipe)

error summary: E0114 23:53:14.513487 509 worker_pool.cpp:281] Worker: Process failed for slice (opcode: 0, source_addr: 0x7fa6e3ffd010, length: 391, dest_addr: 140468583125008, local_nic: mlx5_2, peer_nic: 10.215.192.154:13003@mlx5_2, dest_rkey: 12940, retry_cnt: 6): transport retry counter exceeded I0114 23:53:14.520144 509 transfer_metadata_plugin.cpp:167] Get segment desc, key=mooncake/ram/10.215.192.154:13003, value={"buffers":[{"addr":140466435649552,"length":2147483648,"lkey":[12940],"name":"cpu:0","rkey":[12940]}],"devices":[{"gid":"00:00:00:00:00:00:00:00:00:00:ff:ff:0a:24:21:9a","lid":0,"name":"mlx5_2"}],"name":"10.215.192.154:13003","priority_matrix":{"cpu:0":[["mlx5_2"],[]]},"protocol":"rdma"} I0114 23:53:14.520956 509 transfer_metadata_plugin.cpp:539] SocketHandShakePlugin: connecting 10.215.192.154:13003 I0114 23:53:14.522734 509 transfer_metadata.cpp:61] TransferHandshakeUtil::decode: local_nic_path 10.215.192.154:13003@mlx5_2 peer_nic_path 10.215.192.155:13008@mlx5_2 qp_num count 2 E0114 23:53:18.808466 509 worker_pool.cpp:281] Worker: Process failed for slice (opcode: 0, source_addr: 0x7fa6e3ffd010, length: 391, dest_addr: 140468583125008, local_nic: mlx5_2, peer_nic: 10.215.192.154:13003@mlx5_2, dest_rkey: 12940, retry_cnt: 7): transport retry counter exceeded ERROR 01-14 23:53:18 mooncake_pipe.py:161] Transfer Return Error Traceback (most recent call last): File "/sgl-workspace/test_mooncake_trans/test_mooncake_send_recv.py", line 137, in test_run(my_rank, pipe) File "/sgl-workspace/test_mooncake_trans/test_mooncake_send_recv.py", line 28, in test_run x2 = pipe.recv_tensor() File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 259, in recv_tensor tensor = self.transport_thread.submit(self._recv_impl).result() File "/usr/lib/python3.10/concurrent/futures/_base.py", line 458, in result return self.__get_result() File "/usr/lib/python3.10/concurrent/futures/_base.py", line 403, in __get_result 
raise self._exception File "/usr/lib/python3.10/concurrent/futures/thread.py", line 58, in run result = self.fn(*self.args, **self.kwargs) File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 244, in _recv_impl data = self.transfer_engine.recv_bytes() File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 194, in recv_bytes self.transfer_sync(dst_ptr, src_ptr, length) File "/usr/local/lib/python3.10/dist-packages/vllm/distributed/kv_transfer/kv_pipe/mooncake_pipe.py", line 162, in transfer_sync raise Exception("Transfer Return Error") Exception: Transfer Return Error

Fixed this issue by setting `export MC_GID_INDEX=3`; it seems getBestGidIndex does not select the correct GID automatically.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

2 participants