Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ascend:Training not loaded into NPU #35785

Open
2 of 4 tasks
CurtainRight opened this issue Jan 20, 2025 · 0 comments
Open
2 of 4 tasks

Ascend:Training not loaded into NPU #35785

CurtainRight opened this issue Jan 20, 2025 · 0 comments
Labels

Comments

@CurtainRight
Copy link

System Info

-- CANN 版本: 8.0.RC1
-- Pytorch版本:2.1.0
-- torch_npu:2.1.0.post6
-- Python 版本:3.9.21
-- 训练卡: 910B2
-- transformers:transformers 4.29.1

Who can help?

@ArthurZucker

Information

  • The official example scripts
  • My own modified scripts

Tasks

  • An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
  • My own task or dataset (give details below)

Reproduction

代码如下:

`mport torch
import torch_npu

from transformers import (
Trainer,
TrainingArguments,
EvalPrediction,
AutoModelForSequenceClassification,
AutoTokenizer
)

from datasets import Dataset

import evaluate
import numpy as np

import pandas as pd

import os
MODEL_PATH = os.environ.get('MODEL_PATH', '/data/ts/ascend/model')
DATASET_PATH = os.environ.get('DATASET_PATH', '/data/ts/ascend/datasets')
BUS_MODEL_PATH = os.environ.get('BUS_MODEL_PATH', '/data/ts/ascend/bus_models')

def train(body):
# model_name = body['model_name']
# pretrain_model = body['pretrain_model']
# dataset = body['dataset']

# train_path = body['train_path']
# token_name = body['token_name']
# batch_size = body['batch_size']
# num_train_epochs = body['num_train_epochs']
# basic_mode_type = body['basic_mode_type']
# choose_dataset_size = body['choose_dataset_size']/100
model_name = 'test'#body['model_name']
pretrain_model = 'acge_text_embedding'#body['pretrain_model']
dataset = 'ray-dataset/test.csv'

batch_size = 4
num_train_epochs = 4
basic_mode_type = '1'
choose_dataset_size = 10/100

# 得到语料名
dataset_name =dataset.split('/')[-1]
dataset_path = os.path.join(DATASET_PATH, dataset_name)
# os.system(f"mc mirror --overwrite --remove oss/ray-dataset/{dataset_name} {dataset_path}")
# 标签处理
df = pd.read_csv(dataset_path)
df,num_labels,label_to_id,id_to_label = convert_label(df)
# df.to_csv(dataset_path,index=False)
# 暂时只支持csv
raw_datasets = Dataset.from_pandas(df)
# raw_train, raw_val = raw_datasets.split([1-choose_dataset_size, choose_dataset_size])

#加载模型
model_info = os.path.join(MODEL_PATH, pretrain_model) 
os.makedirs(MODEL_PATH, exist_ok=True)
# os.system(f"mc mirror --overwrite --remove oss/ray-model/{pretrain_model} {model_info}")
tokenizer = AutoTokenizer.from_pretrained(model_info)
model = AutoModelForSequenceClassification.from_pretrained(model_info,num_labels=num_labels, id2label=id_to_label, label2id=label_to_id, ignore_mismatched_sizes=True).to('npu:3')

# token化
def preprocess_function(examples):
    result = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
    result["label"] = [int(l) for l in examples["label"]]  # 假设每个标签是一个列表,取第一个元素
    # result["label"] = examples["label"]
    nput_ids = torch.tensor(result.input_ids, device='npu:3')
    token_type_ids = torch.tensor(result.token_type_ids, device='npu:3')
    attention_mask = torch.tensor(result.attention_mask, device='npu:3')
    label = torch.tensor(result.label, device='npu:3')
    result2 = {}
    result2['label'] = label
    result2['input_ids'] = input_ids
    result2['token_type_ids'] = token_type_ids
    result2['attention_mask'] = attention_mask
    
    return result2

dataset_train = raw_datasets.map(preprocess_function, batched=True)
dataset_val = raw_datasets.map(preprocess_function, batched=True)
dataset_train = dataset_train.remove_columns(['text'])
 


output_info = os.path.join(BUS_MODEL_PATH, model_name) 
training_args = TrainingArguments(
    output_dir=output_info,
    num_train_epochs=num_train_epochs,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=100,
    save_strategy="epoch",
    logging_strategy="steps",
    logging_steps=50
)


trainer = Trainer(
    model=model,
    train_dataset=dataset_train,
    args=training_args,
)

print('开始训练')

result = trainer.train()

print('训练完成')

bus_model_info = os.path.join(MODEL_PATH, model_name)   
checkpoint_path = result.checkpoint.path
os.system(
    f"cp -r {checkpoint_path}/checkpoint/ {bus_model_info}")
os.system(
    f"mc cp -r {checkpoint_path}/checkpoint/ oss/ray-npu/{model_name}")

return result

def convert_label(df):
# 创建标签到ID的映射
label_to_id = {}
id_to_label = {}
id_counter = 0
# 遍历所有标签,创建映射
for label in df['label'].unique():
if label not in label_to_id:
label_to_id[label] = str(id_counter)
id_to_label[str(id_counter)] = label
id_counter += 1

# 将DataFrame中的标签替换为ID
df['label'] = df['label'].map(label_to_id)
return df,id_counter,label_to_id,id_to_label

if name == "main":
train(None)
`

The analysis:
1、cpu:
train defore
Image

train after
Image

2、data、model to npu

Image

Image

3、The operator changes during training have not reached npu
Image

Expected behavior

Reasoning is normal until the NPU runs, but the training operator is still processed by the CPU

@CurtainRight CurtainRight changed the title Ascend硬件:没有在npu上训练 Ascend:Training not loaded into NPU Jan 20, 2025
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
Projects
None yet
Development

No branches or pull requests

1 participant