We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
-- CANN 版本: 8.0.RC1 -- Pytorch版本:2.1.0 -- torch_npu:2.1.0.post6 -- Python 版本:3.9.21 -- 训练卡: 910B2 -- transformers:transformers 4.29.1
@ArthurZucker
examples
代码如下:
`import torch
import torch_npu
from transformers import ( Trainer, TrainingArguments, EvalPrediction, AutoModelForSequenceClassification, AutoTokenizer )
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd
import os
MODEL_PATH = os.environ.get('MODEL_PATH', '/data/ts/ascend/model')
DATASET_PATH = os.environ.get('DATASET_PATH', '/data/ts/ascend/datasets')
BUS_MODEL_PATH = os.environ.get('BUS_MODEL_PATH', '/data/ts/ascend/bus_models')
def train(body):
    """Fine-tune a sequence-classification model on a labelled CSV corpus.

    The CSV must provide a ``text`` column and a ``label`` column. Labels are
    converted to class IDs with ``convert_label``, the texts are tokenized,
    a fraction of the rows is held out for evaluation, and a Hugging Face
    ``Trainer`` runs the fine-tuning. The final model is written under
    ``MODEL_PATH/<model_name>`` and then uploaded with ``mc`` on a
    best-effort basis.

    Args:
        body: Optional mapping of job settings; ``None`` selects every
            default. Recognised keys and their defaults:
            ``model_name`` ('test'), ``pretrain_model``
            ('acge_text_embedding'), ``dataset`` ('ray-dataset/test.csv'),
            ``batch_size`` (4), ``num_train_epochs`` (4),
            ``choose_dataset_size`` (10, percentage of rows held out for
            evaluation) and ``device`` ('npu:3').

    Returns:
        The value returned by ``trainer.train()``.
    """
    # Local import: only needed for the best-effort upload at the end.
    import subprocess

    body = body or {}
    model_name = body.get('model_name', 'test')
    pretrain_model = body.get('pretrain_model', 'acge_text_embedding')
    dataset = body.get('dataset', 'ray-dataset/test.csv')
    batch_size = body.get('batch_size', 4)
    num_train_epochs = body.get('num_train_epochs', 4)
    choose_dataset_size = body.get('choose_dataset_size', 10) / 100
    device = body.get('device', 'npu:3')

    # Derive the corpus file name from the dataset reference.
    dataset_name = dataset.split('/')[-1]
    dataset_path = os.path.join(DATASET_PATH, dataset_name)

    # Label handling (only CSV corpora are supported for now).
    df = pd.read_csv(dataset_path)
    df, num_labels, label_to_id, id_to_label = convert_label(df)
    raw_datasets = Dataset.from_pandas(df)

    # Load the tokenizer and the pretrained model.
    model_info = os.path.join(MODEL_PATH, pretrain_model)
    os.makedirs(MODEL_PATH, exist_ok=True)
    tokenizer = AutoTokenizer.from_pretrained(model_info)
    # NOTE(review): the Trainer moves the model to ``TrainingArguments.device``
    # on its own. transformers 4.29.1 may not detect Ascend NPUs there and
    # could fall back to CPU; native NPU support is believed to have landed
    # in a later release (around 4.32) -- confirm and upgrade if training
    # still runs on the CPU.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_info,
        num_labels=num_labels,
        id2label=id_to_label,
        label2id=label_to_id,
        ignore_mismatched_sizes=True,
    ).to(device)

    def preprocess_function(examples):
        """Tokenize a batch of texts and attach integer labels."""
        result = tokenizer(
            examples["text"],
            padding="max_length",
            truncation=True,
            max_length=512,
        )
        # Keep plain Python values: ``datasets.map`` stores columns in Arrow,
        # so device tensors built here would be converted straight back, and
        # the Trainer moves each batch to its device by itself.
        result["label"] = [int(label) for label in examples["label"]]
        return result

    # Tokenize once and reuse the result for both splits.
    tokenized = raw_datasets.map(preprocess_function, batched=True)
    tokenized = tokenized.remove_columns(['text'])

    # Hold out the requested fraction for evaluation when a split is possible.
    if 0 < choose_dataset_size < 1 and len(tokenized) > 1:
        split = tokenized.train_test_split(
            test_size=choose_dataset_size, seed=42
        )
        dataset_train, dataset_val = split['train'], split['test']
    else:
        dataset_train, dataset_val = tokenized, None

    output_info = os.path.join(BUS_MODEL_PATH, model_name)
    training_args = TrainingArguments(
        output_dir=output_info,
        num_train_epochs=num_train_epochs,
        learning_rate=2e-5,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        # Step-wise evaluation needs an eval dataset; disable it otherwise.
        evaluation_strategy="steps" if dataset_val is not None else "no",
        eval_steps=100,
        save_strategy="epoch",
        logging_strategy="steps",
        logging_steps=50,
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset_train,
        eval_dataset=dataset_val,
    )

    print('开始训练')
    result = trainer.train()
    print('训练完成')

    # ``trainer.train()`` returns a TrainOutput, which has no ``checkpoint``
    # attribute, so persist the final weights through the Trainer instead.
    bus_model_info = os.path.join(MODEL_PATH, model_name)
    trainer.save_model(bus_model_info)

    # Best-effort upload: an argument list avoids shell interpolation of
    # ``model_name``; a missing ``mc`` binary or a failed copy is reported
    # without discarding the training result.
    upload_cmd = [
        "mc", "cp", "-r", f"{bus_model_info}/", f"oss/ray-npu/{model_name}",
    ]
    try:
        completed = subprocess.run(upload_cmd, check=False)
    except OSError as err:
        print(f"mc upload skipped: {err}")
    else:
        if completed.returncode != 0:
            print(f"mc upload failed with exit code {completed.returncode}")

    return result
def convert_label(df):
    """Replace the values of the ``label`` column with string class IDs.

    Distinct labels are numbered 0, 1, 2, ... in order of first appearance,
    and the IDs are stored as strings. ``df`` is modified in place and also
    returned.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A tuple ``(df, num_labels, label_to_id, id_to_label)`` where
        ``num_labels`` is the number of distinct labels, ``label_to_id`` maps
        each original label to its string ID, and ``id_to_label`` is the
        inverse mapping.
    """
    # ``unique()`` already de-duplicates and keeps first-appearance order,
    # so no membership check is needed while numbering.
    unique_labels = df['label'].unique()
    label_to_id = {label: str(idx) for idx, label in enumerate(unique_labels)}
    id_to_label = {class_id: label for label, class_id in label_to_id.items()}

    # Replace the labels in the DataFrame with their IDs.
    df['label'] = df['label'].map(label_to_id)
    return df, len(label_to_id), label_to_id, id_to_label
if __name__ == "__main__":
    train(None)
`
The analysis: 1. CPU: before training
after training
2. The data and the model are moved to the NPU
3. During training, the operators are not dispatched to the NPU
Inference runs normally on the NPU, but the training operators are still executed on the CPU
The text was updated successfully, but these errors were encountered:
No branches or pull requests
System Info
-- CANN 版本: 8.0.RC1
-- Pytorch版本:2.1.0
-- torch_npu:2.1.0.post6
-- Python 版本:3.9.21
-- 训练卡: 910B2
-- transformers:transformers 4.29.1
Who can help?
@ArthurZucker
Information
Tasks
examples
folder (such as GLUE/SQuAD, ...)
Reproduction
代码如下:
`import torch
import torch_npu
from transformers import (
Trainer,
TrainingArguments,
EvalPrediction,
AutoModelForSequenceClassification,
AutoTokenizer
)
from datasets import Dataset
import evaluate
import numpy as np
import pandas as pd
import os
MODEL_PATH = os.environ.get('MODEL_PATH', '/data/ts/ascend/model')
DATASET_PATH = os.environ.get('DATASET_PATH', '/data/ts/ascend/datasets')
BUS_MODEL_PATH = os.environ.get('BUS_MODEL_PATH', '/data/ts/ascend/bus_models')
def train(body):
# model_name = body['model_name']
# pretrain_model = body['pretrain_model']
# dataset = body['dataset']
def convert_label(df):
    """Replace the values of the ``label`` column with string class IDs.

    Distinct labels are numbered 0, 1, 2, ... in order of first appearance,
    and the IDs are stored as strings. ``df`` is modified in place and also
    returned.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A tuple ``(df, num_labels, label_to_id, id_to_label)`` where
        ``num_labels`` is the number of distinct labels, ``label_to_id`` maps
        each original label to its string ID, and ``id_to_label`` is the
        inverse mapping.
    """
    # ``unique()`` already de-duplicates and keeps first-appearance order,
    # so no membership check is needed while numbering.
    unique_labels = df['label'].unique()
    label_to_id = {label: str(idx) for idx, label in enumerate(unique_labels)}
    id_to_label = {class_id: label for label, class_id in label_to_id.items()}

    # Replace the labels in the DataFrame with their IDs and hand back the
    # four values that the training routine unpacks.
    df['label'] = df['label'].map(label_to_id)
    return df, len(label_to_id), label_to_id, id_to_label
if __name__ == "__main__":
    train(None)
`
The analysis:
1. CPU:
before training
after training
2. The data and the model are moved to the NPU
3. During training, the operators are not dispatched to the NPU
Expected behavior
Inference runs normally on the NPU, but the training operators are still executed on the CPU
The text was updated successfully, but these errors were encountered: