01 Introduction
InternVL 2.5 was recently released, with performance comparable to leading commercial models such as GPT-4o and Claude-3.5-Sonnet. It is the first open-source model to exceed 70% on MMMU, gaining 3.7 points through chain-of-thought (CoT) reasoning and demonstrating strong potential for test-time scaling. InternVL 2.5 builds on InternVL 2.0, further improving performance through enhanced training and testing strategies and higher data quality. The model has been studied along several axes, including the vision encoder, the language model, dataset size, and test-time configurations, with the goal of exploring the relationship between model scale and performance. InternVL 2.5 has been evaluated extensively and shows competitive performance across many benchmarks, particularly in multi-discipline reasoning, document understanding, multi-image/video understanding, real-world comprehension, multimodal hallucination detection, visual grounding, multilingual capabilities, and pure language processing.
📕 Key takeaways:
1. InternVL 2.5, an open-source multimodal large language model, has been released, pushing the performance boundary through data and test-time scaling
2. Experiments show that InternVL 2.5 is competitive across a wide range of benchmarks, rivaling commercial models such as GPT-4o and Claude-3.5-Sonnet
3. The model adopts new training and testing strategies together with high-quality datasets and can handle multiple modalities, including text, images, and video
4. With chain-of-thought reasoning, the model surpasses 70% accuracy on the MMMU benchmark, demonstrating strong test-time scaling potential
5. The work provides a new standard for the open-source community for developing and applying multimodal AI systems
InternVL 2.5 retains the same model architecture as its predecessors, InternVL 1.5 and InternVL 2.0, following the "ViT-MLP-LLM" paradigm widely adopted in MLLM research. The InternVL 2.5 implementation integrates a newly incrementally pre-trained InternViT-6B or InternViT-300M with pre-trained LLMs of various sizes and types, including InternLM 2.5 and Qwen 2.5, using a randomly initialized two-layer MLP projector. As in previous versions, to improve the scalability of high-resolution processing, the team simply applies a pixel unshuffle operation that reduces the number of visual tokens to one quarter of the original. As a result, in InternVL 2.5 a 448×448 image tile is represented by 256 visual tokens.
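To make the token arithmetic concrete, below is a minimal sketch of a pixel-unshuffle style token reduction. This illustrates the idea rather than InternVL's actual code; the patch size of 14 and the channel size are assumptions. A 448×448 tile split into 14×14 patches gives 32×32 = 1024 patch tokens, and merging every 2×2 neighborhood into one token along the channel dimension leaves 16×16 = 256 tokens, matching the number quoted above.

import torch

def pixel_unshuffle_tokens(x: torch.Tensor, scale: int = 2) -> torch.Tensor:
    # x: (batch, H, W, C) grid of visual tokens -> (batch, H/scale, W/scale, C*scale*scale)
    b, h, w, c = x.shape
    x = x.reshape(b, h // scale, scale, w // scale, scale, c)
    x = x.permute(0, 1, 3, 2, 4, 5).reshape(b, h // scale, w // scale, c * scale * scale)
    return x

# 448x448 tile with patch size 14 -> 32x32 = 1024 patch tokens (the channel size here is illustrative)
tokens = torch.randn(1, 32, 32, 1024)
merged = pixel_unshuffle_tokens(tokens)   # -> (1, 16, 16, 4096)
print(merged.shape[1] * merged.shape[2])  # 256 visual tokens per tile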
Model link:
https://www.modelscope.cn/collections/InternVL-25-fbde6e47302942
02 Model Download
Download via the command line:
modelscope download --model OpenGVLab/InternVL2_5-4B
Download via the Python SDK:
# Download the model
from modelscope import snapshot_download

model_dir = snapshot_download('OpenGVLab/InternVL2_5-4B')
03 Model Inference
Inference with transformers
import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from modelscope import AutoModel, AutoTokenizer

IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)


def build_transform(input_size):
    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
    transform = T.Compose([
        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=MEAN, std=STD)
    ])
    return transform


def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    best_ratio_diff = float('inf')
    best_ratio = (1, 1)
    area = width * height
    for ratio in target_ratios:
        target_aspect_ratio = ratio[0] / ratio[1]
        ratio_diff = abs(aspect_ratio - target_aspect_ratio)
        if ratio_diff < best_ratio_diff:
            best_ratio_diff = ratio_diff
            best_ratio = ratio
        elif ratio_diff == best_ratio_diff:
            if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
                best_ratio = ratio
    return best_ratio


def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # calculate the existing image aspect ratio
    target_ratios = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
        if i * j <= max_num and i * j >= min_num)
    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])

    # find the closest aspect ratio to the target
    target_aspect_ratio = find_closest_aspect_ratio(
        aspect_ratio, target_ratios, orig_width, orig_height, image_size)

    # calculate the target width and height
    target_width = image_size * target_aspect_ratio[0]
    target_height = image_size * target_aspect_ratio[1]
    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]

    # resize the image
    resized_img = image.resize((target_width, target_height))
    processed_images = []
    for i in range(blocks):
        box = (
            (i % (target_width // image_size)) * image_size,
            (i // (target_width // image_size)) * image_size,
            ((i % (target_width // image_size)) + 1) * image_size,
            ((i // (target_width // image_size)) + 1) * image_size
        )
        # split the image
        split_img = resized_img.crop(box)
        processed_images.append(split_img)
    assert len(processed_images) == blocks
    if use_thumbnail and len(processed_images) != 1:
        thumbnail_img = image.resize((image_size, image_size))
        processed_images.append(thumbnail_img)
    return processed_images


def load_image(image_file, input_size=448, max_num=12):
    image = Image.open(image_file).convert('RGB')
    transform = build_transform(input_size=input_size)
    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    pixel_values = [transform(image) for image in images]
    pixel_values = torch.stack(pixel_values)
    return pixel_values


# If you want to load a model using multiple GPUs, please refer to the `Multiple GPUs` section.
path = 'OpenGVLab/InternVL2_5-4B'
model = AutoModel.from_pretrained(
    path,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    use_flash_attn=True,
    trust_remote_code=True).eval().cuda()
tokenizer = AutoTokenizer.from_pretrained(path, trust_remote_code=True, use_fast=False)

# set the max number of tiles in `max_num`
pixel_values = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
generation_config = dict(max_new_tokens=1024, do_sample=True)

# pure-text conversation
question = 'Hello, who are you?'
response, history = model.chat(tokenizer, None, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Can you tell me a story?'
response, history = model.chat(tokenizer, None, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# single-image single-round conversation
question = '<image>\nPlease describe the image shortly.'
response = model.chat(tokenizer, pixel_values, question, generation_config)
print(f'User: {question}\nAssistant: {response}')

# single-image multi-round conversation
question = '<image>\nPlease describe the image in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Please write a poem according to the image.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, combined images
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

question = '<image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# multi-image multi-round conversation, separate images
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]

question = 'Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'What are the similarities and differences between these two images.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')

# batch inference, single image per sample
pixel_values1 = load_image('./awesome.png', max_num=12).to(torch.bfloat16).cuda()
pixel_values2 = load_image('./noword.jpg', max_num=12).to(torch.bfloat16).cuda()
num_patches_list = [pixel_values1.size(0), pixel_values2.size(0)]
pixel_values = torch.cat((pixel_values1, pixel_values2), dim=0)

questions = ['<image>\nDescribe the image in detail.'] * len(num_patches_list)
responses = model.batch_chat(tokenizer, pixel_values,
                             num_patches_list=num_patches_list,
                             questions=questions,
                             generation_config=generation_config)
for question, response in zip(questions, responses):
    print(f'User: {question}\nAssistant: {response}')

# video multi-round conversation
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    if bound:
        start, end = bound[0], bound[1]
    else:
        start, end = -100000, 100000
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)
    seg_size = float(end_idx - start_idx) / num_segments
    frame_indices = np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])
    return frame_indices


def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    max_frame = len(vr) - 1
    fps = float(vr.get_avg_fps())

    pixel_values_list, num_patches_list = [], []
    transform = build_transform(input_size=input_size)
    frame_indices = get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
    for frame_index in frame_indices:
        img = Image.fromarray(vr[frame_index].asnumpy()).convert('RGB')
        img = dynamic_preprocess(img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        pixel_values = [transform(tile) for tile in img]
        pixel_values = torch.stack(pixel_values)
        num_patches_list.append(pixel_values.shape[0])
        pixel_values_list.append(pixel_values)
    pixel_values = torch.cat(pixel_values_list)
    return pixel_values, num_patches_list


video_path = './showcase.mp4'
pixel_values, num_patches_list = load_video(video_path, num_segments=8, max_num=1)
pixel_values = pixel_values.to(torch.bfloat16).cuda()
video_prefix = ''.join([f'Frame{i+1}: <image>\n' for i in range(len(num_patches_list))])
question = video_prefix + 'What is the red panda doing?'
# Frame1: <image>\nFrame2: <image>\n...\nFrame8: <image>\n{question}
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=None, return_history=True)
print(f'User: {question}\nAssistant: {response}')

question = 'Describe this video in detail. Don\'t repeat.'
response, history = model.chat(tokenizer, pixel_values, question, generation_config,
                               num_patches_list=num_patches_list, history=history, return_history=True)
print(f'User: {question}\nAssistant: {response}')
Streaming output:
from transformers import TextIteratorStreamer
from threading import Thread

# Initialize the streamer
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=10)
# Define the generation configuration
generation_config = dict(max_new_tokens=1024, do_sample=False, streamer=streamer)
# Start the model chat in a separate thread
thread = Thread(target=model.chat, kwargs=dict(
    tokenizer=tokenizer, pixel_values=pixel_values, question=question,
    history=None, return_history=False, generation_config=generation_config,
))
thread.start()

# Initialize an empty string to store the generated text
generated_text = ''
# Loop through the streamer to get the new text as it is generated
for new_text in streamer:
    if new_text == model.conv_template.sep:
        break
    generated_text += new_text
    print(new_text, end='', flush=True)  # Print each new chunk of generated text on the same line
GPU memory usage:
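To check the memory footprint yourself after running the transformers demo above, you can query PyTorch's CUDA allocator. This is a minimal sketch: it assumes the model has already been loaded onto a CUDA device and only covers memory managed by PyTorch (the CUDA context itself is not included).

import torch

# Peak memory allocated/reserved by PyTorch in this process, in GiB
print(f'peak allocated: {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB')
print(f'peak reserved:  {torch.cuda.max_memory_reserved() / 1024**3:.2f} GiB')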
Inference with lmdeploy
Install dependencies:
pip install lmdeploy -U
Example code:
from lmdeploy import pipeline, TurbomindEngineConfig
from lmdeploy.vl import load_image
from modelscope import snapshot_download

model = snapshot_download('OpenGVLab/InternVL2_5-4B')
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
pipe = pipeline(model, backend_config=TurbomindEngineConfig(session_len=8192))
response = pipe(('describe this image', image))
print(response.text)
Deploy a local service with lmdeploy:
lmdeploy serve api_server ./InternVL2_5-4B/ --backend turbomind --server-port 23333

Call the inference service:

from openai import OpenAI

client = OpenAI(api_key='YOUR_API_KEY', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id
response = client.chat.completions.create(
    model=model_name,
    messages=[{
        'role': 'user',
        'content': [{
            'type': 'text',
            'text': 'describe this image',
        }, {
            'type': 'image_url',
            'image_url': {
                'url': 'https://modelscope.oss-cn-beijing.aliyuncs.com/resource/tiger.jpeg',
            },
        }],
    }],
    temperature=0.8,
    top_p=0.8)
print(response)
04 Model Training
We use ms-swift 3.0 to fine-tune InternVL2.5-2B. ms-swift is the official LLM and multimodal LLM fine-tuning and deployment framework from the ModelScope community, supporting 400+ LLMs and 100+ multimodal LLMs.
Here we fine-tune InternVL2.5-2B on LaTeX-OCR using the Python API. This is a quick way to get familiar with the details of fine-tuning, which is helpful when customizing the training process.
If you run into compatibility issues, please refer to:
https://github.com/modelscope/ms-swift/tree/main/examples/train/notebook
First, install ms-swift 3.0 from source:
git clone https://github.com/modelscope/ms-swift.git
cd ms-swift
pip install -e '.[llm]'
Next, import the required packages:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    get_model_tokenizer, load_dataset, get_template, EncodePreprocessor, get_model_arch,
    get_multimodal_target_regex, LazyLLMDataset
)
from swift.utils import get_logger, get_model_parameter_info, plot_images, seed_everything
from swift.tuners import Swift, LoraConfig
from swift.trainers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from functools import partial

logger = get_logger()
seed_everything(42)
Set the training hyperparameters:
# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B'
system = None  # use the default system prompt defined in the template
output_dir = 'output/InternVL2_5-2B'

# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']  # dataset_id or dataset_path; here we sample 20000 examples
data_seed = 42
max_length = 8192
split_dataset_ratio = 0.01  # fraction of data split off as the validation set
num_proc = 4  # number of processes for data preprocessing
strict = False

# lora
lora_rank = 8
lora_alpha = 32
freeze_llm = False
freeze_vit = True
freeze_aligner = True

# training_args
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    learning_rate=1e-4,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_checkpointing=True,
    weight_decay=0.1,
    lr_scheduler_type='cosine',
    warmup_ratio=0.05,
    report_to=['tensorboard'],
    logging_first_step=True,
    save_strategy='steps',
    save_steps=100,
    eval_strategy='steps',
    eval_steps=100,
    gradient_accumulation_steps=16,
    num_train_epochs=1,  # set to 1 here to see training results quickly; normally a larger value is needed
    metric_for_best_model='loss',
    save_total_limit=2,
    logging_steps=5,
    dataloader_num_workers=4,
    data_seed=data_seed,
    remove_unused_columns=False
)

output_dir = os.path.abspath(os.path.expanduser(output_dir))
logger.info(f'output_dir: {output_dir}')
Prepare the model and chat template:
# Get the model and template
model, processor = get_model_tokenizer(model_id_or_path)
logger.info(f'model_info: {model.model_info}')
template = get_template(model.model_meta.template, processor, default_system=system, max_length=max_length)
template.set_mode('train')

# Get target_modules and add trainable LoRA modules to the model
model_arch = get_model_arch(model.model_meta.model_arch)
target_modules = get_multimodal_target_regex(model_arch, freeze_llm=freeze_llm, freeze_vit=freeze_vit,
                                             freeze_aligner=freeze_aligner)
lora_config = LoraConfig(task_type='CAUSAL_LM', r=lora_rank, lora_alpha=lora_alpha,
                         target_modules=target_modules)
model = Swift.prepare_model(model, lora_config)
logger.info(f'lora_config: {lora_config}')

# Print the model structure and trainable parameters
logger.info(f'model: {model}')
model_parameter_info = get_model_parameter_info(model)
logger.info(f'model_parameter_info: {model_parameter_info}')
Prepare the training and validation datasets:
# Download and load the dataset, then split it into training and validation sets
train_dataset, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio,
                                          num_proc=num_proc, strict=strict, seed=data_seed)

logger.info(f'train_dataset: {train_dataset}')
logger.info(f'val_dataset: {val_dataset}')
logger.info(f'train_dataset[0]: {train_dataset[0]}')

# Encode the text into tokens
train_dataset = LazyLLMDataset(train_dataset, template.encode, strict=strict, random_state=data_seed)
val_dataset = LazyLLMDataset(val_dataset, template.encode, strict=strict, random_state=data_seed)
data = train_dataset[0]
logger.info(f'encoded_train_dataset[0]: {data}')

template.print_inputs(data)
Start training with the trainer:
model.enable_input_require_grads()  # compatibility with gradient checkpointing
template.register_post_encode_hook([model])  # register post_encode as a forward_pre_hook

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=template.data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    template=template,
)
trainer.model_accepts_loss_kwargs = True  # compatibility with transformers>=4.46
trainer.train()

last_model_checkpoint = trainer.state.last_model_checkpoint
logger.info(f'last_model_checkpoint: {last_model_checkpoint}')
Visualize the training loss (here we only trained for 400 steps):
You can also use TensorBoard to visualize the training loss during training with the following command: `tensorboard --logdir '{output_dir}/runs'`
images_dir = os.path.join(output_dir, 'images')
logger.info(f'images_dir: {images_dir}')
plot_images(images_dir, training_args.logging_dir, ['train/loss'], 0.9)  # save the training loss plot
Inference after training
Import the required packages:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

from swift.llm import (
    InferEngine, InferRequest, PtEngine, RequestConfig, get_template, load_dataset, load_image
)
from swift.tuners import Swift
from swift.utils import get_model_parameter_info, get_logger, seed_everything

logger = get_logger()
seed_everything(42)
Set the inference hyperparameters:
last_model_checkpoint = 'output/InternVL2_5-2B/vx-xxx/checkpoint-xxx'

# model
model_id_or_path = 'OpenGVLab/InternVL2_5-2B'  # model_id or model_path

# dataset
dataset = ['AI-ModelScope/LaTeX_OCR#20000']
data_seed = 42
split_dataset_ratio = 0.01
num_proc = 4
strict = False

# generation_config
max_new_tokens = 512
temperature = 0
We use the 'pt' infer_backend to run inference with the fine-tuned model. To accelerate inference with vllm/lmdeploy, see: https://github.com/modelscope/ms-swift/blob/main/examples/infer/demo_mllm.py
engine = PtEngine(model_id_or_path)
engine.model = Swift.from_pretrained(engine.model, last_model_checkpoint)
engine.model.requires_grad_(False)  # workaround for peft setting requires_grad of the embedding layer to True
template = get_template(engine.model.model_meta.template, engine.tokenizer)

model_parameter_info = get_model_parameter_info(engine.model)
logger.info(f'model_parameter_info: {model_parameter_info}')
Get the validation set:
# Since data_seed is set, this is the same validation set used during training
_, val_dataset = load_dataset(dataset, split_dataset_ratio=split_dataset_ratio, num_proc=num_proc,
                              strict=strict, seed=data_seed)
val_dataset = val_dataset.select(range(10))  # take the first 10 samples
Run streaming inference and save the images from the validation set:
def infer_stream(engine: InferEngine, infer_request: InferRequest):
    request_config = RequestConfig(max_tokens=max_new_tokens, temperature=temperature, stream=True)
    gen = engine.infer([infer_request], request_config)
    query = infer_request.messages[0]['content']
    print(f'query: {query}\nresponse: ', end='')
    for resp_list in gen:
        print(resp_list[0].choices[0].delta.content, end='', flush=True)
    print()


os.makedirs('images', exist_ok=True)
for i, data in enumerate(val_dataset):
    image = load_image(data['images'][0]['bytes'])
    image.save(f'images/{i}.png')
    infer_stream(engine, InferRequest(**data))
    print('-' * 50)
Inference results: