import tempfile
from modelscope.msdatasets import MsDataset
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
# Paths to the locally exported classification CSVs
# (expected columns: input_text, category).
my_csv_1 = 'results/re1.csv'
my_csv_2 = 'results/sam.csv'

# Fix for the UnicodeDecodeError in the traceback below: byte 0xd5 is not
# valid UTF-8 and is typical of GBK-encoded Chinese text.  The HF `csv`
# builder forwards extra keyword arguments to `pandas.read_csv`, so the
# file encoding can be passed through here.  Alternatively, re-save the
# CSVs as UTF-8.  NOTE(review): assumes GBK — confirm with `chardet` if
# the files came from another tool.
ds = MsDataset.load(
    'csv',
    data_files={'train': [my_csv_1], 'test': [my_csv_2]},
    encoding='gbk')

# The mT5 text-generation trainer expects `src_txt`/`tgt_txt` columns.
train_dataset = ds['train'].remap_columns({'input_text': 'src_txt', 'category': 'tgt_txt'})
eval_dataset = ds['test'].remap_columns({'input_text': 'src_txt', 'category': 'tgt_txt'})
# Warmup length for the Noam learning-rate schedule
# ("Attention Is All You Need", Vaswani et al.).
num_warmup_steps = 500


def noam_lambda(current_step: int) -> float:
    """Return the Noam LR multiplier for a 0-based optimizer step.

    Bug fix: the original wrote ``current_step(-0.5)`` and
    ``num_warmup_steps(-1.5)``, calling integers like functions
    (TypeError at the first scheduler step).  The intended operation
    is exponentiation: lr ∝ min(step^-0.5, step * warmup^-1.5).
    """
    current_step += 1  # shift to 1-based so step 0 does not raise 0 ** -0.5
    return min(current_step ** (-0.5),
               current_step * num_warmup_steps ** (-1.5))
def cfg_modify_fn(cfg):
    """Inject training hyper-parameters into the trainer configuration.

    Wires the Noam ``noam_lambda`` into a step-wise ``LambdaLR``
    scheduler, selects AdamW at lr=1e-3, and sets the epoch count and
    dataloader sizing.  Returns the mutated ``cfg``.
    """
    scheduler_cfg = dict(type='LambdaLR',
                         lr_lambda=noam_lambda,
                         options=dict(by_epoch=False))
    optimizer_cfg = dict(type='AdamW', lr=1e-3, options=dict())
    dataloader_cfg = dict(batch_size_per_gpu=8, workers_per_gpu=1)

    cfg.train.lr_scheduler = scheduler_cfg
    cfg.train.optimizer = optimizer_cfg
    cfg.train.max_epochs = 15
    cfg.train.dataloader = dataloader_cfg
    return cfg
# Bug fix: `tempfile.TemporaryDirectory().name` is unsafe — the directory
# is deleted as soon as the unreferenced TemporaryDirectory object is
# garbage-collected, so checkpoints could be written to a vanished path.
# `mkdtemp()` creates a directory that persists until explicitly removed.
work_dir = tempfile.mkdtemp()

kwargs = dict(
    # Local checkpoint of nlp_mt5_zero-shot-augment_chinese-base.
    model='F:/study/graduationProject/openingReport/nlp_mt5_zero-shot-augment_chinese-base',
    model_revision="master",
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    work_dir=work_dir,
    cfg_modify_fn=cfg_modify_fn)

# Build the ModelScope text-generation trainer and run fine-tuning.
trainer = build_trainer(
    name=Trainers.text_generation_trainer, default_args=kwargs)
trainer.train()
ModelScope这个代码对模型nlp_mt5_zero-shot-augment_chinese-base进行微调,使用的是文本分类的数据集,报错是:Traceback (most recent call last):
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\builder.py", line 1973, in _prepare_splitsingle
for _, table in generator:
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\packaged_modules\csv\csv.py", line 185, in _generate_tables
csv_file_reader = pd.read_csv(file, iterator=True, dtype=dtype, **self.config.pd_read_csv_kwargs)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\streaming.py", line 75, in wrapper
return function(*args, download_config=download_config, **kwargs)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\download\streaming_download_manager.py", line 778, in xpandas_read_csv
return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\pandas\io\parsers\readers.py", line 912, in read_csv
return _read(filepath_or_buffer, kwds)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\pandas\io\parsers\readers.py", line 577, in _read
parser = TextFileReader(filepath_or_buffer, kwds)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\pandas\io\parsers\readers.py", line 1407, in init
self._engine = self._make_engine(f, self.engine)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\pandas\io\parsers\readers.py", line 1679, in _make_engine
return mapping[engine](f, self.options)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\pandas\io\parsers\c_parser_wrapper.py", line 93, in init
self._reader = parsers.TextReader(src, kwds)
File "pandas_libs\parsers.pyx", line 550, in pandas._libs.parsers.TextReader.cinit
File "pandas_libs\parsers.pyx", line 639, in pandas._libs.parsers.TextReader._get_header
File "pandas_libs\parsers.pyx", line 850, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas_libs\parsers.pyx", line 861, in pandas._libs.parsers.TextReader._check_tokenize_status
File "pandas_libs\parsers.pyx", line 2021, in pandas._libs.parsers.raise_parser_error
UnicodeDecodeError: 'utf-8' codec can't decode byte 0xd5 in position 21: invalid continuation byte
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "F:\study\graduationProject\vue2_mt5\vue_flask\finetune.py", line 17, in
ds = MsDataset.load('csv', data_files={'train': [my_csv_1], 'test': [my_csv_2]})
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\modelscope\msdatasets\ms_dataset.py", line 258, in load
dataset_inst = LocalDataLoaderManager(
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\modelscope\msdatasets\data_loader\data_loader_manager.py", line 74, in load_dataset
return hf_data_loader(
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\load.py", line 2549, in load_dataset
builder_instance.download_and_prepare(
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\builder.py", line 1005, in download_and_prepare
self._download_and_prepare(
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\builder.py", line 1100, in _download_and_prepare
self._prepare_split(split_generator, **prepare_split_kwargs)
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\builder.py", line 1860, in _prepare_split
for job_id, done, content in self._prepare_split_single(
File "D:\tool\Anaconda\anaconda3\envs\modelscope\lib\site-packages\datasets\builder.py", line 2016, in _prepare_split_single
raise DatasetGenerationError("An error occurred while generating the dataset") from e
datasets.exceptions.DatasetGenerationError: An error occurred while generating the dataset
Process finished with exit code 1
请问怎么修改
# Minimal reproduction suggested in the answer: load one local CSV file.
my_csv = '/path/to/my_file.csv'
train_ds = MsDataset.load('csv', data_files=[my_csv])
# Bug fix: the original iterated the undefined name `ds`;
# the dataset loaded above is bound to `train_ds`.
print(next(iter(train_ds)))
https://github.com/google-research/timesfm?tab=readme-ov-file
对这个代码库不熟悉哦,您看下能否指定本地路径加载。此回答整理自钉群“魔搭ModelScope开发者联盟群 ①”
使用 Python 库如 chardet 自动检测和指定文件的编码。
在加载 CSV 文件时,你可以传递额外的参数给 pd.read_csv。由于 MsDataset.load 方法使用了 datasets 库的 load_dataset 函数,你可能需要查看 datasets 库的相关文档来了解如何传递这些参数。
ModelScope旨在打造下一代开源的模型即服务共享平台,为泛AI开发者提供灵活、易用、低成本的一站式模型服务产品,让模型应用更简单!欢迎加入技术交流群:微信公众号:魔搭ModelScope社区,钉钉群号:44837352