Environment Setup and Installation
1. This article was run on an Alibaba Cloud ECS instance with 4 * V100 GPUs.
2. Python >= 3.8
Server Connection and Environment Preparation
# Connect to the server (CentOS); you can also connect via VS Code
ssh root@xxx.xxx.xxx.xxx
passwd          # change the root password
lsb_release -a  # check the operating system version

# Install and configure git
yum install git
git config --global user.name "baichuan"
git config --global user.email "baichuan@abc.com"
git config --global init.defaultBranch main
git config --list

# Create a user and set its password (you can also work under root)
useradd -d /home/baichuan -m baichuan
passwd baichuan
su baichuan

# Install Miniconda: keep pressing [ENTER], then answer yes
wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
sh Miniconda3-latest-Linux-x86_64.sh

# Create and activate the conda virtual environment
conda create --name baichuan python=3.10
conda activate baichuan

# Set a global pip mirror and install the required Python packages
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple
pip install modelscope
pip install numpy pandas matplotlib scikit-learn
pip install transformers datasets
pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
pip install tqdm
pip install tensorboard
pip install torchmetrics
pip install sentencepiece
pip install accelerate
pip install numpy -U  # resolve torchmetrics dependencies and update numpy
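Before moving on, it is worth a quick sanity check that PyTorch sees the GPUs (a minimal check added here, not part of the original walkthrough):

# Quick environment sanity check (not part of the original article's code)
import torch

print(torch.__version__)          # expect a cu118 build, matching the install command above
print(torch.cuda.is_available())  # should be True on the GPU instance
print(torch.cuda.device_count())  # should print 4 on the 4 * V100 instance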
Clone ModelScope with git and run the example code
# Fetch the example code
git clone https://github.com/modelscope/modelscope.git
cd modelscope/examples/pytorch/llm
python baichuan_sft.py
The following sections walk through the details of the code.
Enter the Python environment and collect basic environment information. Code directory: https://github.com/modelscope/modelscope/blob/master/examples/pytorch/llm/_common.py
from _common import *

device_ids = [0, 1, 2, 3]
logger.info(device_ids)
select_device(device_ids)
seed_everything(42)
Model Links and Download
The Baichuan series of models is now open-sourced on the ModelScope community:
Baichuan-7B
https://modelscope.cn/models/baichuan-inc/baichuan-7B/summary
Baichuan-13B-Base
https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Base/summary
Baichuan-13B-Chat
https://modelscope.cn/models/baichuan-inc/Baichuan-13B-Chat/summary
The community supports downloading the model repo directly.
Download the model weights
from modelscope.hub.snapshot_download import snapshot_download

model_id = 'baichuan-inc/Baichuan-13B-Base'
model_dir = get_model_dir(model_id, model_revision='v1.0.1')
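Here get_model_dir comes from _common.py (pulled in earlier via from _common import *). Judging from the snapshot_download import, it is presumably a thin wrapper around the hub API, roughly equivalent to:

# Presumed equivalent direct call (assumption: get_model_dir wraps snapshot_download)
model_dir = snapshot_download(model_id, revision='v1.0.1')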
Alternatively, the following code downloads the model and loads the model and tokenizer:
Model download, loading the model and tokenizer
# ### Loading Model and Tokenizer
WORK_DIR = "runs/baichuan_13B"
LORA_TARGET_MODULES = ["W_pack"]


def get_baichuan13B_model_tokenizer(model_dir: str, load_model: bool = True):
    sys.path.insert(0, model_dir)
    from configuration_baichuan import BaichuanConfig
    from tokenization_baichuan import BaichuanTokenizer
    from modeling_baichuan import BaichuanForCausalLM
    model_config = BaichuanConfig.from_pretrained(model_dir)
    model_config.torch_dtype = torch.float16
    logger.info(f'model_config: {model_config}')
    tokenizer = BaichuanTokenizer.from_pretrained(model_dir)
    model = None
    if load_model:
        model = BaichuanForCausalLM.from_pretrained(
            model_dir,
            config=model_config,
            device_map='auto',
            torch_dtype=torch.float16)
    return model, tokenizer


model, tokenizer = get_baichuan13B_model_tokenizer(model_dir)
Enabling GRADIENT_CHECKPOINTING for training effectively reduces GPU memory usage: activations are recomputed during the backward pass instead of being stored, trading extra compute for memory.
GRADIENT_CHECKPOINTING = True
if GRADIENT_CHECKPOINTING:
    # baichuan13B does not implement the `get_input_embeddings` function
    def get_input_embeddings(self):
        return self.model.embed_tokens

    model.gradient_checkpointing_enable()
    model.__class__.get_input_embeddings = get_input_embeddings.__get__(model)
    model.enable_input_require_grads()

if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

logger.info(f'bos_token_id: {tokenizer.bos_token_id}, eos_token_id: {tokenizer.eos_token_id}, '
            f'pad_token_id: {tokenizer.pad_token_id}')
Model Inference
Take Baichuan-13B-Base and Baichuan-13B-Chat as examples.
Baichuan-13B-Base inference code
from modelscope.utils.constant import Tasks
from modelscope.pipelines import pipeline
import torch
from modelscope import snapshot_download, Model

model_dir = snapshot_download("baichuan-inc/Baichuan-13B-Base")
model = Model.from_pretrained(
    model_dir, device_map="balanced",
    trust_remote_code=True, torch_dtype=torch.float16)
text_generation_zh = pipeline(task=Tasks.text_generation, model=model)
text_generation_zh._model_prepare = True
result_zh = text_generation_zh(
    '今天天气是真的',
    min_length=10,
    max_length=512,
    num_beams=3,
    temperature=0.8,
    do_sample=False,
    early_stopping=True,
    top_k=50,
    top_p=0.8,
    repetition_penalty=1.2,
    length_penalty=1.2,
    no_repeat_ngram_size=6)
print(result_zh)
Baichuan-13B-Chat inference code
import torch
from modelscope import snapshot_download, Model

model_dir = snapshot_download("baichuan-inc/Baichuan-13B-Chat", revision='v1.0.3')
model = Model.from_pretrained(
    model_dir, device_map="balanced",
    trust_remote_code=True, torch_dtype=torch.float16)
messages = []
messages.append({"role": "user", "content": "世界上第二高的山峰是哪一座?"})
response = model(messages)
print(response)
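Since model(messages) returns the assistant's reply, a multi-turn conversation can presumably be continued by appending the reply and a new user turn (a hypothetical continuation, assuming the reply is a plain string):

# Hypothetical multi-turn continuation (assumes `response` is a plain string)
messages.append({"role": "assistant", "content": response})
messages.append({"role": "user", "content": "那世界第一高的山峰呢?"})
print(model(messages))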
Dataset Links and Download
The community-recommended datasets are alpaca-gpt4-data-zh and alpaca-gpt4-data-en. Both are open-sourced on the ModelScope community and can be downloaded directly with the ModelScope SDK:
from modelscope import MsDataset

dataset_zh = MsDataset.load("AI-ModelScope/alpaca-gpt4-data-zh", split="train")
dataset_en = MsDataset.load("AI-ModelScope/alpaca-gpt4-data-en", split="train")
print(len(dataset_zh["instruction"]))
print(len(dataset_en["instruction"]))
print(dataset_zh[0])
"""Out
48818
52002
{'instruction': '保持健康的三个提示。', 'input': None, 'output': '以下是保持健康的三个提示:\n\n1. 保持身体活动。每天做适当的身体运动,如散步、跑步或游泳,能促进心血管健康,增强肌肉力量,并有助于减少体重。\n\n2. 均衡饮食。每天食用新鲜的蔬菜、水果、全谷物和脂肪含量低的蛋白质食物,避免高糖、高脂肪和加工食品,以保持健康的饮食习惯。\n\n3. 睡眠充足。睡眠对人体健康至关重要,成年人每天应保证 7-8 小时的睡眠。良好的睡眠有助于减轻压力,促进身体恢复,并提高注意力和记忆力。'}
"""
Best Practices for Model Training
The fine-tuning process consists of the following steps:
- Designing the prompt (see the sketch after this list)
- Building the final model with the fine-tuning methods provided by ModelScope
- Fine-tuning the model with the Trainer provided by ModelScope
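The article does not spell out the prompt design at this point; based on the "### 用户 / ### AI助手" prompt used by the inference script later, a plausible sketch of the template and the tokenize step is below. The template and function body are illustrative assumptions, not the actual code from baichuan_sft.py.

# Illustrative sketch only (assumption): the real template and tokenize_function live in baichuan_sft.py
PROMPT = """### 用户
{instruction}
### AI助手
"""

def tokenize_function(example, tokenizer, max_length=2048):
    # Concatenate the instruction and the optional input into the user turn
    instruction = example['instruction']
    if example.get('input'):
        instruction = f"{instruction}\n{example['input']}"
    src_ids = tokenizer(PROMPT.format(instruction=instruction),
                        add_special_tokens=False)['input_ids']
    tgt_ids = tokenizer(example['output'], add_special_tokens=False)['input_ids']
    input_ids = src_ids + tgt_ids + [tokenizer.eos_token_id]
    # Mask the prompt tokens with -100 so the loss covers only the answer
    labels = [-100] * len(src_ids) + tgt_ids + [tokenizer.eos_token_id]
    return {'input_ids': input_ids[:max_length], 'labels': labels[:max_length]}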
Prepare LoRA:
# ### Preparing lora
LORA_RANK = 8
LORA_ALPHA = 32
LORA_DROPOUT_P = 0.1
lora_config = LoRAConfig(
    replace_modules=LORA_TARGET_MODULES,
    rank=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT_P)
logger.info(f'lora_config: {lora_config}')
Swift.prepare_model(model, lora_config)

show_freeze_layers(model)
print_model_info(model)
_p = list(model.parameters())[100]
logger.info(f'device: {_p.device}, dtype: {_p.dtype}')
model.bfloat16()
Load the datasets
# ### Loading Dataset
tokenize_function = partial(tokenize_function, tokenizer=tokenizer)
train_dataset, val_dataset = get_alpaca_en_zh_dataset(tokenize_function)

# Data analysis
stat_dataset(train_dataset)
stat_dataset(val_dataset)
data_collate_fn = partial(data_collate_fn, tokenizer=tokenizer)
print_examples(train_dataset[0], tokenizer)
Configure the Config. Note that cumulative_iters=16 in the optimizer options enables gradient accumulation, so with BATCH_SIZE = 1 the effective batch size per optimizer step is 16.
# Setting Config
cfg_file = os.path.join(model_dir, 'configuration.json')

BATCH_SIZE = 1
MAX_EPOCHS = 1
T_max = get_T_max(len(train_dataset), BATCH_SIZE, MAX_EPOCHS, True)
WORK_DIR = get_work_dir(WORK_DIR)
EVAL_INTERVAL = 200
CONFIG = Config({
    'train': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': True,
            'drop_last': True,
            'pin_memory': True
        },
        'max_epochs': MAX_EPOCHS,
        'work_dir': WORK_DIR,
        'optimizer': {
            'type': 'AdamW',
            'lr': 1e-4,
            'weight_decay': 0.01,
            'options': {
                'cumulative_iters': 16,
                'grad_clip': {
                    'norm_type': 2,
                    'max_norm': 2.0
                }
            }
        },
        'lr_scheduler': {
            'type': 'CosineAnnealingLR',
            'T_max': T_max,
            'eta_min': 1e-5,
            'options': {
                'by_epoch': False,
                'warmup': {
                    'type': 'LinearWarmup',
                    'warmup_ratio': 0.1,
                    'warmup_iters': 200
                }
            }
        },
        'hooks': [
            {'type': 'CheckpointHook', 'by_epoch': False, 'interval': EVAL_INTERVAL, 'max_checkpoint_num': 1},
            {'type': 'EvaluationHook', 'by_epoch': False, 'interval': EVAL_INTERVAL},
            {'type': 'BestCkptSaverHook', 'metric_key': 'acc', 'save_best': True, 'rule': 'max', 'max_checkpoint_num': 1},
            {'type': 'TextLoggerHook', 'by_epoch': True,  # Whether EpochBasedTrainer is used
             'interval': 5},
            {'type': 'TensorboardHook', 'by_epoch': False, 'interval': 5}
        ]
    },
    'evaluation': {
        'dataloader': {
            'batch_size_per_gpu': BATCH_SIZE,
            'workers_per_gpu': 1,
            'shuffle': False,
            'drop_last': False,
            'pin_memory': True
        },
        'metrics': [
            {'type': 'my_metric', 'vocab_size': tokenizer.vocab_size}
        ]
    }
})
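For reference, get_T_max (from _common.py) gives the cosine scheduler the total number of training iterations. A minimal sketch of what it plausibly computes, assuming it simply counts batches over all epochs:

# Sketch of get_T_max (assumption; the real helper is defined in _common.py)
def get_T_max(dataset_len: int, batch_size: int, max_epochs: int, drop_last: bool) -> int:
    # iterations per epoch, scaled by the epoch count
    if drop_last:
        steps_per_epoch = dataset_len // batch_size
    else:
        steps_per_epoch = -(-dataset_len // batch_size)  # ceiling division
    return steps_per_epoch * max_epochs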
Start fine-tuning
# ### Finetuning
def cfg_modify_fn(cfg: Config) -> Config:
    cfg.update(CONFIG)
    return cfg


trainer = EpochBasedTrainer(
    model=model,
    cfg_file=cfg_file,
    data_collator=data_collate_fn,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    remove_unused_data=True,
    seed=42,
    device='cpu',  # No placement for model, leave the model to `device_map`
    cfg_modify_fn=cfg_modify_fn,
)
trainer.train()
Visualization
TensorBoard command (example):
tensorboard --logdir /home/baichuan/my_git/modelscope/runs/baichuan_13B/v0-20230711-172449 --port 6006
# ### Visualization
tb_dir = os.path.join(WORK_DIR, 'tensorboard_output')
plot_image(tb_dir, ['loss'], 0.9)
Resource Consumption
Training Baichuan-13B-Base with LoRA occupies roughly 40 GB of GPU memory.
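To verify this figure on your own run, the peak memory per device can be queried from PyTorch (a small snippet added here, not part of the original article; nvidia-smi shows the same picture):

# Check peak GPU memory per device (not part of the original article's code)
import torch

for i in range(torch.cuda.device_count()):
    peak_gib = torch.cuda.max_memory_allocated(i) / 1024**3
    print(f'cuda:{i} peak allocated: {peak_gib:.1f} GiB')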
Inference Results
import sys

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextStreamer

from modelscope import snapshot_download
from modelscope.swift import LoRAConfig, Swift

LORA_CKPT_FPATH = '/path/to/your/xxx.pth'

model_dir = snapshot_download('baichuan-inc/Baichuan-13B-Base', 'v1.0.1')
sys.path.insert(0, model_dir)
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_dir, device_map='auto', torch_dtype=torch.float16, trust_remote_code=True)
model.bfloat16()  # Consistent with training

# The LoRA config must match training (LORA_RANK = 8, LORA_ALPHA = 32)
# so that the checkpoint weights load with the correct shapes
lora_config = LoRAConfig(
    replace_modules=['W_pack'],
    rank=8,
    lora_alpha=32,
    lora_dropout=0,
    pretrained_weights=LORA_CKPT_FPATH)
Swift.prepare_model(model, lora_config)

streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
PROMPT = """### 用户
{instruction}
### AI助手
<s>"""
instruction = '今天睡眠不好怎么办?'
text = PROMPT.format(instruction=instruction)
input_ids = tokenizer(text, return_tensors='pt')['input_ids'].to('cuda')
generate_ids = model.generate(
    input_ids=input_ids,
    max_new_tokens=512,
    streamer=streamer,
    pad_token_id=tokenizer.pad_token_id,
    temperature=0.7,
    top_k=50,
    do_sample=True)
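The streamer already prints tokens as they are generated; to also capture the final string, the newly generated tokens can be decoded from generate_ids (a small addition, not part of the original script):

# Decode only the newly generated tokens (addition, not in the original script)
output_text = tokenizer.decode(generate_ids[0, input_ids.shape[1]:],
                               skip_special_tokens=True)
print(output_text)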
Open-source code link: https://github.com/modelscope/modelscope/blob/master/examples/pytorch/llm/baichuan_sft.py