1. 数据集准备

这个没啥好说的，因为可以发现mmdet只支持voc格式'VOCDataset'或者是coco的格式'CocoDataset'，所以对于自己的数据集，需要将标注文件转换成coco的格式或者是voc的格式。（没有yolo格式）

一般来说，还是转化成coco格式比较方便，对标注的文件构建一个coco格式的json文件即可。

COCO 格式实例分割的必要键如下：

{
    "images": [image],
    "annotations": [annotation],
    "categories": [category]
}
image = {
    "id": int,
    "width": int,
    "height": int,
    "file_name": str,
}
annotation = {
    "id": int,
    "image_id": int,
    "category_id": int,
    "segmentation": RLE or [polygon],
    "area": float,
    "bbox": [x,y,width,height],
    "iscrowd": 0 or 1,
}
categories = [{
    "id": int,
    "name": str,
    "supercategory": str,
}]

完整的coco格式介绍见：https://cocodataset.org/#format-data 。只要完成上面这些必要信息的编写，将自己数据集的标注构建成一个json文件，我们就可以用CocoDataset来训练和评估模型了。（同样的，你当然也可以转换成voc的格式并使用VOCDataset，不过现在coco格式已经成为了主流）

参考coco的数据存放格式：

对于存放图像的训练数据集或者是验证数据集，这里coco会单独为其构建一个标注的json文件。基于此进行参考。

由于我们本来跑的是yolov5的项目，所以数据集原本的组织格式是：

|-----image
  |-----train
  |-----val
|-----label
  |-----train
  |-----val
|-----mask.yml

其中，label中存放的是txt标注文件，image中存放的是具体的图像文件，用过yolov5项目的朋友应该都知道，这里就不详细介绍了。这里，我们参考coco数据集的格式，将我们自己的数据集进行相同的配置，首先需要将txt标注文件转换成一个coco格式的json文件。

yolo2coco转换脚本：

import os
import json
import random
import time
from PIL import Image
import csv
# ---------------------------------------------------------------------------
# User settings: adjust the paths and class_names for your own dataset.
# ---------------------------------------------------------------------------
task = 'train'  # or 'val'
coco_format_save_path = '../../data/mask/annotations'  # folder that receives the generated COCO-format json
yolo_format_annotation_path = '../../data/mask/labels/' + task  # folder holding the YOLO-format txt labels
img_pathDir = '../../data/mask/images/' + task  # folder holding the images

# Category settings. A category id is the position of the name in class_names,
# i.e. the same class index that is stored in the YOLO txt files (no offset is
# applied).
class_names = ['with_mask', 'without_mask', 'mask_weared_incorrect']
categories = [{'id': class_id, 'name': label, 'supercategory': ""}
              for class_id, label in enumerate(class_names)]


def yolo_line_to_coco(line, img_width, img_height, anno_id, img_id):
    """Convert one YOLO label line into a COCO annotation dict.

    Args:
        line: text of the form "class_id x_center y_center width height",
            where the four box values are normalised to [0, 1]. Fields may be
            separated by any run of whitespace.
        img_width: image width in pixels.
        img_height: image height in pixels.
        anno_id: id to store in the annotation's 'id' field.
        img_id: id of the image the annotation belongs to.

    Returns:
        A dict with the keys 'id', 'image_id', 'category_id', 'segmentation',
        'area', 'bbox', 'iscrowd' and 'attributes'. 'bbox' is
        [xmin, ymin, width, height] in pixels and 'segmentation' is the box
        outline as a single four-corner polygon.

    Raises:
        ValueError: if the line does not contain exactly five fields or a
            field cannot be parsed as a number.
    """
    # split() without an argument tolerates tabs and repeated spaces.
    class_id, x, y, w, h = line.split()
    class_id, x, y, w, h = int(class_id), float(x), float(y), float(w), float(h)

    # Normalised centre/size -> absolute pixel corners.
    xmin = (x - w / 2) * img_width
    ymin = (y - h / 2) * img_height
    xmax = (x + w / 2) * img_width
    ymax = (y + h / 2) * img_height
    box_w = w * img_width
    box_h = h * img_height
    height, width = abs(ymax - ymin), abs(xmax - xmin)

    return {
        'id': anno_id,
        'image_id': img_id,
        'category_id': class_id,
        'segmentation': [[xmin, ymin, xmax, ymin, xmax, ymax, xmin, ymax]],
        'area': height * width,
        'bbox': [xmin, ymin, box_w, box_h],
        'iscrowd': 0,
        'attributes': "",
    }


def build_coco_dict(image_dir, label_dir):
    """Build the COCO dictionary for every image found in image_dir.

    Args:
        image_dir: directory containing the image files.
        label_dir: directory containing one YOLO txt file per image, named
            after the image with the extension replaced by '.txt'.

    Returns:
        A dict with the keys 'categories', 'images' and 'annotations'.
        Image ids and annotation ids both start at 1.
    """
    coco = {'categories': categories, 'images': [], 'annotations': []}
    img_id = 0   # running image id
    anno_id = 0  # running annotation id

    # Sorted so that the ids are reproducible; os.listdir order is arbitrary.
    for image_file in sorted(os.listdir(image_dir)):
        # NOTE(review): files whose name contains '_' are skipped, exactly as
        # in the original script. This looks specific to the author's dataset;
        # remove the check if your image names legitimately contain '_'.
        if '_' in image_file:
            continue

        img_id += 1
        # The context manager closes the file handle as soon as the size is read.
        with Image.open(os.path.join(image_dir, image_file)) as image:
            img_w, img_h = image.size  # PIL reports (width, height)

        coco['images'].append({
            'id': img_id,
            'width': img_w,
            'height': img_h,
            'file_name': image_file,
        })

        # splitext keeps names such as "a.b.jpg" intact ("a.b.txt").
        txt_file = os.path.splitext(image_file)[0] + '.txt'
        try:
            with open(os.path.join(label_dir, txt_file), 'r') as fr:
                lines = fr.readlines()
        except FileNotFoundError:
            # An image without a label file is kept as an image with no boxes.
            print('no label file for ' + image_file + ', treated as having no annotations')
            lines = []

        for line in lines:
            if not line.strip():
                continue  # ignore blank lines, e.g. a trailing newline
            anno_id += 1
            coco['annotations'].append(
                yolo_line_to_coco(line, img_w, img_h, anno_id, img_id))

    return coco


def main():
    """Convert the configured split and write '<task>.json' to the save path."""
    if not os.path.exists(coco_format_save_path):
        raise FileNotFoundError("Please mkdir the save path")

    # The optional COCO keys 'licenses' and 'info' are intentionally omitted.
    write_json_context = build_coco_dict(img_pathDir, yolo_format_annotation_path)

    name = os.path.join(coco_format_save_path, task + '.json')
    # ensure_ascii=False can emit non-ASCII file names, so fix the encoding.
    with open(name, 'w', encoding='utf-8') as fw:
        json.dump(write_json_context, fw, indent=4, ensure_ascii=False)
    print('finish converters')


if __name__ == '__main__':
    main()

ps：使用这个脚本需要自行设置路径和设置类别，将class_names设置为自己数据集的类别即可。

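然后，删除无关的文件夹，配置成如下结构即可（与后文配置文件中的data_root、ann_file、img_prefix保持一致）：

data/mask
|-----annotations
  |-----train.json
  |-----val.json
|-----train
|-----val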

2. 准备配置文件

接着是准备配置，从而可以成功加载数据集。假设配置在目录下configs/balloon/并命名为mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_balloon.py，对于模型来说，首先需要先进行继承，然后再进行更改，参考配置如下：（个人认为的两个重点是设置dataset_type数据集类型，还需要设置类别名称与最后的预测类别数）

# The new config inherits a base config to highlight the necessary modification.
# `_base_` is resolved relative to the directory of this config file
# (configs/balloon/), hence the leading '../'.
_base_ = '../mask_rcnn/mask_rcnn_r50_caffe_fpn_mstrain-poly_1x_coco.py'
# We also need to change the num_classes in head to match the dataset's annotation
model = dict(
    roi_head=dict(
        bbox_head=dict(num_classes=1),
        mask_head=dict(num_classes=1)))
# Modify dataset related settings
# The dataset class is spelled 'CocoDataset' (not 'COCODataset').
dataset_type = 'CocoDataset'
classes = ('balloon',)
# Only the keys that differ from the base config are listed for each split;
# val and test both point at the validation data.
data = dict(
    train=dict(
        img_prefix='balloon/train/',
        classes=classes,
        ann_file='balloon/train/annotation_coco.json'),
    val=dict(
        img_prefix='balloon/val/',
        classes=classes,
        ann_file='balloon/val/annotation_coco.json'),
    test=dict(
        img_prefix='balloon/val/',
        classes=classes,
        ann_file='balloon/val/annotation_coco.json'))
# We can use the pre-trained Mask RCNN model to obtain higher performance
load_from = 'checkpoints/mask_rcnn_r50_caffe_fpn_mstrain-poly_3x_coco_bbox_mAP-0.408__segm_mAP-0.37_20200504_163245-42aa3d00.pth'

上面是官方的参考，下面来对我们的数据集进行配置。

数据集路径配置：

# Training set definition (excerpt). data_root, dataset_type, classes and
# train_pipeline are not defined in this excerpt; they come from other parts
# of the config file.
train_dataset = dict(
    type='MultiImageMixDataset',
    dataset=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/train.json',  # path of the annotation file
        img_prefix=data_root + 'train/',  # directory that holds the images
        # The inner dataset only loads the image and its box annotations.
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ],
        filter_empty_gt=False,
    ),
    # train_pipeline is attached to the 'MultiImageMixDataset' wrapper.
    pipeline=train_pipeline)
......
# Data settings (excerpt). train_dataset, dataset_type, classes, data_root and
# test_pipeline are not defined in this excerpt.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    persistent_workers=True,
    train=train_dataset,
    # val and test use the same annotation file and image directory.
    val=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/val.json',  # path of the annotation file
        img_prefix=data_root + 'val/',  # directory that holds the images
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/val.json',
        img_prefix=data_root + 'val/',
        pipeline=test_pipeline))
......

ps：ann_file表示标注文件的路径，img_prefix表示存放图像的目录

这里我使用的是yolox-s模型进行训练，所以在对应的configs/yolox目录下，改写yolox_s_8x8_300e_coco.py配置文件即可（本文将改写后的配置另存为yolox_s_8x8_300e_mask.py，后面的训练命令用的就是这个文件）。

主要的更改是自定义的数据集路径，以及模型设置。由于这里我的口罩检测数据集只有3类，所以将对应的num_classes设置为3。

模型参数配置：

# model settings
# img_scale is not defined in this excerpt; it comes from another part of the
# config file.
model = dict(
    type='YOLOX',
    input_size=img_scale,
    random_size_range=(15, 25),
    random_size_interval=10,
    backbone=dict(type='CSPDarknet', deepen_factor=0.33, widen_factor=0.5),
    neck=dict(
        type='YOLOXPAFPN',
        in_channels=[128, 256, 512],
        out_channels=128,
        num_csp_blocks=1),
    bbox_head=dict(
        type='YOLOXHead', num_classes=3, in_channels=128, feat_channels=128),  # set to 3: the mask dataset has three classes
    train_cfg=dict(assigner=dict(type='SimOTAAssigner', center_radius=2.5)),
    # In order to align the source code, the threshold of the val phase is
    # 0.01, and the threshold of the test phase is 0.001.
    test_cfg=dict(score_thr=0.01, nms=dict(type='nms', iou_threshold=0.65)))

此外，由于类别名称不一样，还需要设置名称。

类别名称设置：

# dataset settings
data_root = 'data/mask/'  # every annotation/image path below is built from this prefix
dataset_type = 'CocoDataset'
# Same names, in the same order, as class_names in the yolo2coco script.
classes = ('with_mask', 'without_mask', 'mask_weared_incorrect')
# train_pipeline is not defined in this excerpt; it comes from another part of
# the config file.
train_dataset = dict(
    type='MultiImageMixDataset',
    dataset=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/train.json',  # path of the annotation file
        img_prefix=data_root + 'train/',  # directory that holds the images
        # The inner dataset only loads the image and its box annotations.
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations', with_bbox=True)
        ],
        filter_empty_gt=False,
    ),
    # train_pipeline is attached to the 'MultiImageMixDataset' wrapper.
    pipeline=train_pipeline)
...
# Data settings. test_pipeline is not defined in this excerpt; it comes from
# another part of the config file.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    persistent_workers=True,
    train=train_dataset,
    # val and test use the same annotation file and image directory.
    val=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/val.json',  # path of the annotation file
        img_prefix=data_root + 'val/',  # directory that holds the images
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        classes=classes,
        ann_file=data_root + 'annotations/val.json',
        img_prefix=data_root + 'val/',
        pipeline=test_pipeline))
        ...

3. 训练与推理

设置完配置文件，即可进行训练与推理测试

# Train:
# train on a single GPU (GPU 0)
python tools/train.py configs/yolox/yolox_s_8x8_300e_mask.py --gpu-id 0
# train on GPUs 0, 1 and 3; the trailing 3 is the number of GPUs
CUDA_VISIBLE_DEVICES=0,1,3 ./tools/dist_train.sh configs/yolox/yolox_s_8x8_300e_mask.py 3
# Test: evaluate the latest checkpoint with the bbox metric on GPU 0
python tools/test.py configs/yolox/yolox_s_8x8_300e_mask.py work_dirs/yolox_s_8x8_300e_mask/latest.pth --eval bbox --gpu-id 0