
Adding DS Feature API in accelerator #5423

Open · wants to merge 17 commits into base: master
Changes from 16 commits
11 changes: 10 additions & 1 deletion accelerator/abstract_accelerator.py
@@ -5,13 +5,16 @@

import abc
from abc import ABC

from .constants import *
from deepspeed.git_version_info import compatible_ops as __compatible_ops__

class DeepSpeedAccelerator(ABC):

def __init__(self):
self._name = None
self._communication_backend_name = None
self._ds_features: dict[str, bool] = {ZERO_1: False, ZERO_2: False, ZERO_3: False}
self._ds_features.update({op: compatibility for op, compatibility in __compatible_ops__.items()})
@delock (Collaborator) commented on Apr 25, 2024:
This reflection mechanism had better be lazily initialized; otherwise there could be a circular dependency, since this __init__ function may be called before __compatible_ops__ has been initialized.
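A minimal sketch of the lazy initialization suggested here, assuming __compatible_ops__ is a mapping from op name to compatibility flag; the _lazy_init_ds_features helper is a hypothetical name and not part of this PR:

```python
from abc import ABC

from .constants import ZERO_1, ZERO_2, ZERO_3


class DeepSpeedAccelerator(ABC):

    def __init__(self):
        self._name = None
        self._communication_backend_name = None
        # Defer building the feature dict so that importing the accelerator
        # module does not require __compatible_ops__ to be populated yet.
        self._ds_features = None

    def _lazy_init_ds_features(self):
        if self._ds_features is None:
            # Imported here to avoid a circular import at module load time.
            from deepspeed.git_version_info import compatible_ops
            self._ds_features = {ZERO_1: False, ZERO_2: False, ZERO_3: False}
            self._ds_features.update(compatible_ops)

    def get_ds_feature(self, key):
        self._lazy_init_ds_features()
        return self._ds_features[key]

    def set_ds_feature(self, key, value):
        self._lazy_init_ds_features()
        self._ds_features[key] = value
```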


@abc.abstractmethod
def is_synchronized_device(self):
@@ -295,3 +298,9 @@ def visible_devices_envs(self):
@abc.abstractmethod
def set_visible_devices_envs(self, current_env, local_accelerator_ids):
...

def get_ds_feature(self, key):
return self._ds_features[key]

def set_ds_feature(self, key, value):
self._ds_features[key] = value
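As a usage illustration, a hedged sketch of how callers might query and set these flags through the accelerator singleton; the call sites are examples rather than code from this PR, and the import path assumes the new accelerator/constants.py ships as deepspeed.accelerator.constants:

```python
from deepspeed.accelerator import get_accelerator
from deepspeed.accelerator.constants import ZERO_3, OP_FUSED_ADAM

accel = get_accelerator()

# Gate a ZeRO stage 3 code path on what the current accelerator reports.
if accel.get_ds_feature(ZERO_3):
    print(f"ZeRO-3 is available on the '{accel.device_name()}' accelerator")

# A backend (or a test) can override a flag explicitly.
accel.set_ds_feature(OP_FUSED_ADAM, True)
```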
31 changes: 31 additions & 0 deletions accelerator/constants.py
@@ -0,0 +1,31 @@
# Copyright (c) Microsoft Corporation.
# SPDX-License-Identifier: Apache-2.0

# DeepSpeed Team

# A list of constants used in the DeepSpeed feature dictionary

OP_ASYNC_IO = "async_io"
OP_CCL_COMM = "deepspeed_ccl_comm"
OP_CPU_ADAGRAD = "cpu_adagrad"
OP_CPU_ADAM = "cpu_adam"
OP_CPU_LION = "cpu_lion"
OP_EVOFORMER_ATTN = "evoformer_attn"
OP_FP_QUANTIZER = "fp_quantizer"
OP_FUSED_ADAM = "fused_adam"
OP_FUSED_LAMB = "fused_lamb"
OP_FUSED_LION = "fused_lion"
OP_INFERENCE_CORE_OPS = "inference_core_ops"
OP_CUTLASS_OPS = "cutlass_ops"
Contributor commented:
A general name is needed to cover the non-cuda devices?

Collaborator replied:
Hi @rogerxfeng8 - is there a specific one you're referencing? I believe we call all devices (cuda and non-cuda) accelerators.

OP_QUANTIZER = "quantizer"
OP_RAGGED_DEVICE_OPS = "ragged_device_ops"
OP_RAGGED_OPS = "ragged_ops"
OP_RANDOM_LTD = "random_ltd"
OP_SPARSE_ATTN = "sparse_attn"
OP_SPATIAL_INFERENCE = "spatial_inference"
OP_STOCHASTIC_TRANSFORMER = "stochastic_transformer"
OP_TRANSFORMER = "transformer"
OP_TRANSFORMER_INFERENCE = "transformer_inference"
ZERO_1 = "zero1"
ZERO_2 = "zero2"
ZERO_3 = "zero3"
6 changes: 6 additions & 0 deletions accelerator/cuda_accelerator.py
@@ -9,6 +9,8 @@
import importlib

from .abstract_accelerator import DeepSpeedAccelerator
from .constants import *

# During the setup stage torch may not be installed; passing when torch is
# absent allows the op-builder-related APIs to be executed.
try:
@@ -23,10 +25,14 @@
class CUDA_Accelerator(DeepSpeedAccelerator):

def __init__(self):
super().__init__()
self._name = 'cuda'
self._communication_backend_name = 'nccl'
if pynvml is None:
self._init_pynvml()
self.set_ds_feature(ZERO_1, True)
self.set_ds_feature(ZERO_2, True)
self.set_ds_feature(ZERO_3, True)

def _init_pynvml(self):
global pynvml
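For comparison with the CUDA change above, a hypothetical non-CUDA backend would advertise its capabilities the same way in its constructor. The class name, device name, communication backend, and supported stages below are illustrative assumptions, not part of this diff:

```python
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator
from deepspeed.accelerator.constants import ZERO_1, ZERO_2, ZERO_3


class MyDeviceAccelerator(DeepSpeedAccelerator):
    # The remaining abstract methods (is_synchronized_device, device_name, ...)
    # are omitted for brevity; a real backend must implement them all.

    def __init__(self):
        super().__init__()
        self._name = 'mydevice'
        self._communication_backend_name = 'gloo'
        # Advertise only the ZeRO stages this backend actually supports.
        self.set_ds_feature(ZERO_1, True)
        self.set_ds_feature(ZERO_2, True)
        self.set_ds_feature(ZERO_3, False)
```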