Ascend：Training not loaded into NPU #35785

CurtainRight · 2025-01-20T10:00:28Z

System Info

-- CANN 版本: 8.0.RC1
-- Pytorch版本:2.1.0
-- torch_npu：2.1.0.post6
-- Python 版本:3.9.21
-- 训练卡: 910B2
-- transformers: 4.29.1

Who can help?

@ArthurZucker

Information

The official example scripts
My own modified scripts

Tasks

An officially supported task in the examples folder (such as GLUE/SQuAD, ...)
My own task or dataset (give details below)

Reproduction

代码如下：

`import torch
import torch_npu

from transformers import (
Trainer,
TrainingArguments,
EvalPrediction,
AutoModelForSequenceClassification,
AutoTokenizer
)

from datasets import Dataset

import evaluate
import numpy as np

import pandas as pd

import os
# Filesystem locations used by train(); each can be overridden through the
# environment variable of the same name, otherwise the default below is used.
# Directory that holds the pretrained models (also the copy target after training).
MODEL_PATH = os.environ.get('MODEL_PATH', '/data/ts/ascend/model')
# Directory that holds the CSV datasets.
DATASET_PATH = os.environ.get('DATASET_PATH', '/data/ts/ascend/datasets')
# Parent directory of the Trainer output directory for each trained model.
BUS_MODEL_PATH = os.environ.get('BUS_MODEL_PATH', '/data/ts/ascend/bus_models')

def train(body):
    """Fine-tune a sequence-classification model on a CSV dataset.

    Reads ``<DATASET_PATH>/<dataset file name>`` with pandas, converts its
    labels to ids via ``convert_label``, tokenizes the ``text`` column,
    trains with the Hugging Face ``Trainer`` and finally copies the
    checkpoint directory to the local model directory and to object storage
    through the ``mc`` command-line tool.

    Args:
        body: Request payload. Currently unused: every value that used to be
            read from it is hard-coded below (the original reads are kept as
            comments for reference).

    Returns:
        The value returned by ``trainer.train()``.
    """
    # model_name = body['model_name']
    # pretrain_model = body['pretrain_model']
    # dataset = body['dataset']

    # train_path = body['train_path']
    # token_name = body['token_name']
    # batch_size = body['batch_size']
    # num_train_epochs = body['num_train_epochs']
    # basic_mode_type = body['basic_mode_type']
    # choose_dataset_size = body['choose_dataset_size']/100
    model_name = 'test'  # body['model_name']
    pretrain_model = 'acge_text_embedding'  # body['pretrain_model']
    dataset = 'ray-dataset/test.csv'

    batch_size = 4
    num_train_epochs = 4
    basic_mode_type = '1'  # not used anywhere below
    choose_dataset_size = 10/100  # only referenced by the commented-out split below

    # Derive the dataset file name from the bucket-style path.
    dataset_name = dataset.split('/')[-1]
    dataset_path = os.path.join(DATASET_PATH, dataset_name)
    # os.system(f"mc mirror --overwrite --remove oss/ray-dataset/{dataset_name} {dataset_path}")
    # Label handling: replace raw labels with ids and build both mappings.
    df = pd.read_csv(dataset_path)
    df, num_labels, label_to_id, id_to_label = convert_label(df)
    # df.to_csv(dataset_path,index=False)
    # Only CSV input is supported for now.
    raw_datasets = Dataset.from_pandas(df)
    # raw_train, raw_val = raw_datasets.split([1-choose_dataset_size, choose_dataset_size])

    # Load the tokenizer and the model, then move the model to NPU device 3.
    model_info = os.path.join(MODEL_PATH, pretrain_model)
    os.makedirs(MODEL_PATH, exist_ok=True)
    # os.system(f"mc mirror --overwrite --remove oss/ray-model/{pretrain_model} {model_info}")
    tokenizer = AutoTokenizer.from_pretrained(model_info)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_info,
        num_labels=num_labels,
        id2label=id_to_label,
        label2id=label_to_id,
        ignore_mismatched_sizes=True,
    ).to('npu:3')

    # Tokenization.
    def preprocess_function(examples):
        """Tokenize a batch of rows and return its fields as tensors on npu:3."""
        result = tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)
        # convert_label stores the ids as strings, so cast them back to int.
        result["label"] = [int(l) for l in examples["label"]]
        # result["label"] = examples["label"]
        # Fixed: this was assigned to the misspelled name `nput_ids`, which
        # made the `input_ids` lookup below raise NameError.
        input_ids = torch.tensor(result.input_ids, device='npu:3')
        token_type_ids = torch.tensor(result.token_type_ids, device='npu:3')
        attention_mask = torch.tensor(result.attention_mask, device='npu:3')
        label = torch.tensor(result.label, device='npu:3')
        result2 = {}
        result2['label'] = label
        result2['input_ids'] = input_ids
        result2['token_type_ids'] = token_type_ids
        result2['attention_mask'] = attention_mask

        return result2

    dataset_train = raw_datasets.map(preprocess_function, batched=True)
    # NOTE(review): dataset_val is built from the same rows as the training
    # set and is never passed to the Trainer below - confirm whether it was
    # meant to be supplied as `eval_dataset`.
    dataset_val = raw_datasets.map(preprocess_function, batched=True)
    dataset_train = dataset_train.remove_columns(['text'])

    output_info = os.path.join(BUS_MODEL_PATH, model_name)
    # NOTE(review): evaluation is requested every 100 steps, but the Trainer
    # below receives no eval_dataset - presumably evaluation will fail or be
    # skipped; verify against the installed transformers version.
    training_args = TrainingArguments(
        output_dir=output_info,
        num_train_epochs=num_train_epochs,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        evaluation_strategy="steps",
        eval_steps=100,
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50
    )

    # NOTE(review): the Trainer is expected to place the model on the device
    # selected by `training_args`; whether that resolves to an NPU presumably
    # depends on the installed transformers version - confirm.
    trainer = Trainer(
        model=model,
        train_dataset=dataset_train,
        args=training_args,
    )

    print('开始训练')

    result = trainer.train()

    print('训练完成')

    bus_model_info = os.path.join(MODEL_PATH, model_name)
    # NOTE(review): `result.checkpoint.path` looks like a Ray Train result
    # attribute; the object returned by Trainer.train() may not provide
    # `.checkpoint` - confirm before relying on the copy steps below.
    checkpoint_path = result.checkpoint.path
    os.system(
        f"cp -r {checkpoint_path}/checkpoint/ {bus_model_info}")
    os.system(
        f"mc cp -r {checkpoint_path}/checkpoint/ oss/ray-npu/{model_name}")

    return result

def convert_label(df):
    """Replace the values of ``df['label']`` with string ids.

    Each distinct label is assigned the next id ("0", "1", ...) in the order
    in which ``df['label'].unique()`` yields it. The ``label`` column of
    ``df`` is overwritten in place.

    Args:
        df: pandas DataFrame that has a ``label`` column.

    Returns:
        A tuple ``(df, num_labels, label_to_id, id_to_label)``: the same
        DataFrame with the ``label`` column replaced by ids, the number of
        distinct labels, the label -> id mapping and the id -> label mapping.
        Ids are strings in both mappings.
    """
    # Build the label <-> id mappings.
    label_to_id = {}
    id_to_label = {}
    id_counter = 0
    # Walk over every distinct label and register it once.
    for label in df['label'].unique():
        if label not in label_to_id:
            label_to_id[label] = str(id_counter)
            id_to_label[str(id_counter)] = label
            id_counter += 1

    # Replace the labels in the DataFrame with their ids.
    df['label'] = df['label'].map(label_to_id)
    return df, id_counter, label_to_id, id_to_label

# Script entry point: start a training run with the hard-coded settings.
# The double underscores were lost in the paste; restored here.
if __name__ == "__main__":
    train(None)
`

The analysis：
1、cpu：
train before

train after

2、data、model to npu

3、During training, the operators are not executed on the NPU

Expected behavior

Inference runs normally on the NPU, but the training operators are still executed on the CPU

The text was updated successfully, but these errors were encountered:

CurtainRight added the bug label Jan 20, 2025

CurtainRight changed the title ~~Ascend硬件：没有在npu上训练~~ Ascend：Training not loaded into NPU Jan 20, 2025

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Ascend：Training not loaded into NPU #35785

Ascend：Training not loaded into NPU #35785

CurtainRight commented Jan 20, 2025

Ascend：Training not loaded into NPU #35785

Ascend：Training not loaded into NPU #35785

Comments

CurtainRight commented Jan 20, 2025

System Info

Who can help?

Information

Tasks

Reproduction

Expected behavior