环境：

阿里云弹性加速计算 EAIS（GPU），8 核 32 GB，显存 16 GB，预装 ModelScope Library，预装镜像：ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.4.3

错误堆栈：

Traceback (most recent call last):
  File "D:\pyProject\NLPTest\train_base.py", line 93, in <module>
    trainer.train()
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\trainer.py", line 676, in train
    self.train_loop(self.train_dataloader)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\trainer.py", line 1181, in train_loop
    self.invoke_hook(TrainerStages.after_train_epoch)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\trainer.py", line 1328, in invoke_hook
    getattr(hook, fn_name)(self)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\hooks\evaluation_hook.py", line 35, in after_train_epoch
    self.do_evaluate(trainer)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\hooks\evaluation_hook.py", line 47, in do_evaluate
    eval_res = trainer.evaluate()
               ^^^^^^^^^^^^^^^^^^
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\trainer.py", line 763, in evaluate
    metric_values = self.evaluation_loop(self.eval_dataloader,
                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\trainer.py", line 1233, in evaluation_loop
    metric_values = single_gpu_test(
                    ^^^^^^^^^^^^^^^^
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\utils\inference.py", line 56, in single_gpu_test
    evaluate_batch(trainer, data, metric_classes, vis_closure)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\trainers\utils\inference.py", line 183, in evaluate_batch
    metric_cls.add(batch_result, data)
  File "C:\Users\PC2022\AppData\Roaming\Python\Python311\site-packages\modelscope\metrics\text_generation_metric.py", line 35, in add
    ground_truths = inputs[self.target_text]
                    ~~~~~~^^^^^^^^^^^^^^^^^^
KeyError: 'tgts'

Process finished with exit code 1

训练代码：

from torch.utils.tensorboard import SummaryWriter
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.metainfo import Trainers
from datasets import load_dataset


# CSV file backing each split. The `text1` and `text2` columns of both files
# are renamed below, so both files must provide them.
data_files = {"train": "train.csv", "test": "test.csv"}

# Read both splits with `datasets.load_dataset`, then wrap the result in MsDataset.
dataset = MsDataset(load_dataset("csv", data_files=data_files, delimiter=","))


def _prepare_split(split_name):
    """Return one split with renamed columns and newline-terminated sources.

    ``text1``/``text2`` are renamed to ``src_txt``/``tgt_txt`` and a trailing
    newline is appended to every ``src_txt`` value.
    """
    renamed = MsDataset(dataset[split_name]).remap_columns({
        'text1': 'src_txt',
        'text2': 'tgt_txt',
    })
    return renamed.map(lambda example: {'src_txt': example['src_txt'] + '\n'})


train_dataset = _prepare_split('train')
eval_dataset = _prepare_split('test')


# dataset_dict = MsDataset.load('DuReader_robust-QG')
#
# train_dataset1 = dataset_dict['train']
# eval_dataset1 = dataset_dict['test']
#
# train_dataset = dataset_dict['train'].remap_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) \
#     .map(lambda example: {'src_txt': example['src_txt'] + '\n'})
# eval_dataset = dataset_dict['validation'].remap_columns({'text1': 'src_txt', 'text2': 'tgt_txt'}) \
#     .map(lambda example: {'src_txt': example['src_txt'] + '\n'})


# Print the prepared training split as a quick sanity check before training.
print(train_dataset)

# Directory handed to the trainer as its `work_dir`.
tmp_dir = './gpt3_dureader'

# Number of epochs handed to the trainer as `max_epochs`.
max_epochs = 1

# Warm-up length used by `noam_lambda`, expressed in scheduler steps.
num_warmup_steps = 200


def noam_lambda(current_step: int):
    """Return the learning-rate multiplier for the given step index.

    The index is shifted by one before use, so index 0 is treated as step 1
    and never hits the negative power of zero. The result is the smaller of
    ``step ** -0.5`` and ``step * num_warmup_steps ** -1.5``: the second term
    grows linearly with the step and is the smaller one early on, after which
    the inverse-square-root term takes over.
    """
    step = current_step + 1
    decay_factor = step ** -0.5
    warmup_factor = step * num_warmup_steps ** -1.5
    return min(decay_factor, warmup_factor)

def cfg_modify_fn(cfg):
    """Customise the configuration object for this fine-tuning run.

    Args:
        cfg: Configuration object to adjust. It must already expose ``train``
            (with a ``hooks`` list), ``preprocessor`` and ``model`` sections.
            It is modified in place.

    Returns:
        The same ``cfg`` object, after modification.
    """
    # Step-wise (not per-epoch) LambdaLR schedule driven by `noam_lambda`.
    cfg.train.lr_scheduler = {
        'type': 'LambdaLR',
        'lr_lambda': noam_lambda,
        'options': {
            'by_epoch': False
        }
    }
    cfg.train.optimizer = {'type': 'AdamW', 'lr': 1e-4}
    cfg.train.dataloader = {
        'batch_size_per_gpu': 4,
        'workers_per_gpu': 0
    }
    # NOTE(review): an evaluation period is also set below; confirm that the
    # hook and the period setting do not cause evaluation to run twice.
    cfg.train.hooks.append({
        'type': 'EvaluationHook',
        'by_epoch': True,
        'interval': 1
    })
    # NOTE(review): a tensor-parallel size of 8 together with
    # `checkpoint_model_parallel_size = 1` looks inconsistent for a
    # single-GPU machine; confirm against the actual device count.
    cfg.megatron = {
        "tensor_model_parallel_size": 8
    }

    # Overlay the dataloader/period settings on the evaluation section that
    # is already present instead of replacing the whole section. Replacing it
    # drops every other key (for example a `metrics` entry shipped with the
    # model configuration). Presumably that is what makes the trainer fall
    # back to a default metric that reads a 'tgts' field the batches do not
    # contain; verify against the model's configuration.json.
    existing_evaluation = getattr(cfg, 'evaluation', None)
    if isinstance(existing_evaluation, dict):
        evaluation = dict(existing_evaluation)
    else:
        evaluation = {}
    evaluation.update({
        "dataloader": {
            "batch_size_per_gpu": 2,
            "workers_per_gpu": 0,
            "shuffle": False
        },
        "period": {
            "by_epoch": True,
            "interval": 1
        }
    })
    cfg.evaluation = evaluation

    cfg.preprocessor.sequence_length = 512
    cfg.model.checkpoint_model_parallel_size = 1
    return cfg

# Arguments forwarded to the trainer factory through `default_args`.
kwargs = {
    'model': 'damo/nlp_gpt3_text-generation_chinese-base',
    'train_dataset': train_dataset,
    'eval_dataset': eval_dataset,
    'max_epochs': max_epochs,
    'work_dir': tmp_dir,
    'cfg_modify_fn': cfg_modify_fn,
    # 'cfg_file': 'your_configuration.json',  # point to a custom configuration file
}

# Build the GPT-3 trainer and start training.
trainer = build_trainer(name=Trainers.gpt3_trainer, default_args=kwargs)
trainer.train()

在训练模型的时候遇到 KeyError: 'tgts'

环境：

错误堆栈：

训练代码：

自然语言处理

相关文章

热门讨论

热门文章