使用的yolov5为2021年6月23号的版本v5.0
此篇作为学习笔记,也花了比较大的功夫,尽可能对每一个要点进行了解释
如有一些问题或错误,欢迎大家一起交流。
1. train.py中数据集加载(datasets和dataloader)
在train.py中,通过这一段进行读取数据集.由于加载验证集和加载训练集是同样的方式,这里就用训练集作为例子.
# Trainloader dataloader, dataset = create_dataloader(train_path, imgsz, batch_size // WORLD_SIZE, gs, single_cls, hyp=hyp, augment=True, cache=opt.cache_images, rect=opt.rect, rank=RANK, workers=workers, image_weights=opt.image_weights, quad=opt.quad, prefix=colorstr('train: '))
1.1 create_dataloader()
从下面的代码可以看到,这一部分的核心是LoadImagesAndLabels():它负责构建数据集,其后的代码只是把数据集封装成dataloader(确定batch_size、worker数量、sampler等)。
因此后文主要围绕LoadImagesAndLabels()展开。
def create_dataloader(path, imgsz, batch_size, stride, single_cls=False, hyp=None, augment=False, cache=False, pad=0.0,
                      rect=False, rank=-1, workers=8, image_weights=False, quad=False, prefix=''):
    """Create a LoadImagesAndLabels dataset and a dataloader that iterates over it.

    Args:
        path: Dataset location, forwarded to LoadImagesAndLabels.
        imgsz: Target image size, forwarded as ``img_size``.
        batch_size: Requested batch size; clamped to ``len(dataset)``.
        stride: Converted with ``int()`` and forwarded to the dataset.
        single_cls, hyp, augment, pad, rect, image_weights, prefix: Forwarded to
            LoadImagesAndLabels unchanged.
        cache: Forwarded to the dataset as ``cache_images``.
        rank: Passed to ``torch_distributed_zero_first``; any value other than -1
            also enables a ``DistributedSampler``.
        workers: Upper bound on the number of dataloader worker processes.
        quad: If true, ``LoadImagesAndLabels.collate_fn4`` is used instead of
            ``LoadImagesAndLabels.collate_fn``.

    Returns:
        A ``(dataloader, dataset)`` tuple.
    """
    # Make sure only the first process in DDP process the dataset first, and the following others can use the cache
    with torch_distributed_zero_first(rank):
        dataset = LoadImagesAndLabels(path, imgsz, batch_size,
                                      augment=augment,  # augment images
                                      hyp=hyp,  # augmentation hyperparameters
                                      rect=rect,  # rectangular training
                                      cache_images=cache,
                                      single_cls=single_cls,
                                      stride=int(stride),
                                      pad=pad,
                                      image_weights=image_weights,
                                      prefix=prefix)

    batch_size = min(batch_size, len(dataset))
    # os.cpu_count() returns None when the CPU count cannot be determined; fall back
    # to 1 so that min() never has to compare None with an int.
    cpu_count = os.cpu_count() or 1
    nw = min([cpu_count, batch_size if batch_size > 1 else 0, workers])  # number of workers
    sampler = torch.utils.data.distributed.DistributedSampler(dataset) if rank != -1 else None
    # Use torch.utils.data.DataLoader() if dataset.properties will update during training else InfiniteDataLoader()
    loader = torch.utils.data.DataLoader if image_weights else InfiniteDataLoader
    dataloader = loader(dataset,
                        batch_size=batch_size,
                        num_workers=nw,
                        sampler=sampler,
                        pin_memory=True,
                        collate_fn=LoadImagesAndLabels.collate_fn4 if quad else LoadImagesAndLabels.collate_fn)
    return dataloader, dataset
1.1.2 LoadImagesAndLabels()第一部分
先解释LoadImagesAndLabels中的__init__部分,也就是create_dataloader()中实际运行的部分。
LoadImagesAndLabels中的__getitem__只有在训练时(迭代dataloader时)才会被调用,所以这里暂不展开,放到第2节再解释。
class LoadImagesAndLabels(Dataset):  # for training/testing
    """Dataset of images and their labels.

    The constructor collects image paths, loads (or builds) the label cache,
    optionally prepares per-batch shapes for rectangular training, and optionally
    caches all images in memory.
    """

    def __init__(self, path, img_size=640, batch_size=16, augment=False, hyp=None, rect=False, image_weights=False,
                 cache_images=False, single_cls=False, stride=32, pad=0.0, prefix=''):
        """Index the dataset found at ``path``.

        Args:
            path: A directory, a text file listing image paths, or a list of either.
            img_size: Target image size; also used to derive ``mosaic_border``.
            batch_size: Used to assign every image a batch index.
            augment: Stored on the instance; together with ``rect`` decides ``self.mosaic``.
            hyp: Hyperparameter mapping, stored as ``self.hyp``.
            rect: Enable rectangular training (forced off when ``image_weights`` is true).
            image_weights: Stored on the instance; disables ``rect`` when true.
            cache_images: If true, load every image into ``self.imgs`` up front.
            single_cls: If true, overwrite column 0 of every label array with 0.
            stride: Rectangular batch shapes are rounded up to a multiple of this.
            pad: Added (in stride units) before rounding the rectangular batch shapes.
            prefix: Text prepended to log and error messages.

        Raises:
            Exception: If no image could be collected from ``path``.
            AssertionError: If the cache reports no labels while ``augment`` is true.
        """
        self.img_size = img_size
        self.augment = augment
        self.hyp = hyp
        self.image_weights = image_weights  # (bool)
        self.rect = False if image_weights else rect
        self.mosaic = self.augment and not self.rect  # load 4 images at a time into a mosaic (only during training)
        self.mosaic_border = [-img_size // 2, -img_size // 2]
        self.stride = stride
        self.path = path

        # Collect every candidate file under `path` into the list `f`, then keep only
        # those whose extension is in img_formats. self.img_files is that filtered list
        # with '/' replaced by os.sep, sorted.
        try:
            f = []  # image files
            for p in path if isinstance(path, list) else [path]:
                p = Path(p)  # os-agnostic
                if p.is_dir():  # dir
                    # search p and all of its sub-directories
                    f += glob.glob(str(p / '**' / '*.*'), recursive=True)
                    # f = list(p.rglob('**/*.*'))  # pathlib
                elif p.is_file():  # file
                    with open(p, 'r') as t:
                        t = t.read().strip().splitlines()
                        parent = str(p.parent) + os.sep
                        f += [x.replace('./', parent) if x.startswith('./') else x for x in t]  # local to global path
                        # f += [p.parent / x.lstrip(os.sep) for x in t]  # local to global path (pathlib)
                else:
                    raise Exception(f'{prefix}{p} does not exist')
            self.img_files = sorted([x.replace('/', os.sep) for x in f if x.split('.')[-1].lower() in img_formats])
            # self.img_files = sorted([x for x in f if x.suffix[1:].lower() in img_formats])  # pathlib
            assert self.img_files, f'{prefix}No images found'
        except Exception as e:
            # Chain the original error so its traceback is not lost.
            raise Exception(f'{prefix}Error loading data from {path}: {e}\nSee {help_url}') from e

        # Check cache
        # self.label_files holds one label path per entry of self.img_files.
        # The cache is a dict: image path -> (labels, shape, segments), plus the
        # bookkeeping keys 'results', 'hash', 'version' and 'msgs'.
        self.label_files = img2label_paths(self.img_files)  # labels
        # NOTE: `p` is the last path visited by the loop above.
        cache_path = (p if p.is_file() else Path(self.label_files[0]).parent).with_suffix('.cache')  # cached labels
        if cache_path.is_file():
            # NOTE(review): torch.load unpickles the file; only load cache files from a trusted source.
            cache, exists = torch.load(cache_path), True  # load
            if cache.get('version') != 0.3 or cache.get('hash') != get_hash(self.label_files + self.img_files):
                cache, exists = self.cache_labels(cache_path, prefix), False  # re-cache
        else:
            cache, exists = self.cache_labels(cache_path, prefix), False  # cache

        # Display cache
        nf, nm, ne, nc, n = cache.pop('results')  # found, missing, empty, corrupted, total
        if exists:
            d = f"Scanning '{cache_path}' images and labels... {nf} found, {nm} missing, {ne} empty, {nc} corrupted"
            tqdm(None, desc=prefix + d, total=n, initial=n)  # display cache results
            if cache['msgs']:
                logging.info('\n'.join(cache['msgs']))  # display warnings
        assert nf > 0 or not augment, f'{prefix}No labels in {cache_path}. Can not train without labels. See {help_url}'

        # Read cache
        # Drop the bookkeeping keys so that only image entries remain, then unpack:
        #   self.labels      list of N arrays, one per image (class x y w h per row)
        #   self.shapes      array of N image shapes (w, h)
        #   self.img_files   list of N image paths
        #   self.label_files list of N label (txt) paths
        for key in ('hash', 'version', 'msgs'):  # remove items
            cache.pop(key)
        labels, shapes, self.segments = zip(*cache.values())
        self.labels = list(labels)
        self.shapes = np.array(shapes, dtype=np.float64)
        self.img_files = list(cache.keys())  # update
        self.label_files = img2label_paths(cache.keys())  # update
        if single_cls:
            for x in self.labels:
                x[:, 0] = 0

        # n: number of images; bi: batch index of every image; nb: number of batches.
        n = len(shapes)  # number of images
        # np.int was removed in NumPy 1.24; the builtin int is the equivalent dtype.
        bi = np.floor(np.arange(n) / batch_size).astype(int)  # batch index
        nb = bi[-1] + 1  # number of batches
        self.batch = bi  # batch index of image
        self.n = n
        self.indices = range(n)

        # Rectangular Training
        if self.rect:
            # Sort by aspect ratio (h / w) so that images sharing a batch index have
            # similar aspect ratios, and apply the same ordering to the file lists,
            # the labels and the shapes.
            s = self.shapes  # wh
            ar = s[:, 1] / s[:, 0]  # aspect ratio
            irect = ar.argsort()
            self.img_files = [self.img_files[i] for i in irect]
            self.label_files = [self.label_files[i] for i in irect]
            self.labels = [self.labels[i] for i in irect]
            self.shapes = s[irect]  # wh
            ar = ar[irect]

            # Set training image shapes
            # `shapes` here is in (h, w) order, unlike self.shapes which is (w, h).
            # Each batch gets one shape whose factors are both <= 1, taken from the
            # aspect ratio in that batch closest to 1, so that less padding is needed.
            # self.batch_shapes is used by __getitem__ in place of self.img_size when
            # rectangular training is on.
            shapes = [[1, 1]] * nb
            for i in range(nb):
                ari = ar[bi == i]  # aspect ratios of the images in batch i
                mini, maxi = ari.min(), ari.max()
                if maxi < 1:
                    # every image is wider than tall: keep the width, shrink the height
                    shapes[i] = [maxi, 1]
                elif mini > 1:
                    # every image is taller than wide: keep the height, shrink the width
                    shapes[i] = [1, 1 / mini]

            # Scale to pixels and round up to a multiple of stride.
            self.batch_shapes = np.ceil(np.array(shapes) * img_size / stride + pad).astype(int) * stride

        # Cache images into memory for faster training (WARNING: large datasets may exceed system RAM)
        self.imgs = [None] * n
        if cache_images:
            gb = 0  # Gigabytes of cached images
            self.img_hw0, self.img_hw = [None] * n, [None] * n
            results = ThreadPool(num_threads).imap(lambda x: load_image(*x), zip(repeat(self), range(n)))
            pbar = tqdm(enumerate(results), total=n)
            for i, x in pbar:
                self.imgs[i], self.img_hw0[i], self.img_hw[i] = x  # img, hw_original, hw_resized = load_image(self, i)
                gb += self.imgs[i].nbytes
                pbar.desc = f'{prefix}Caching images ({gb / 1E9:.1f}GB)'
            pbar.close()
2. train.py中读取数据集用来训练
这一步便是调用LoadImagesAndLabels()中__getitem__
for i, (imgs, targets, paths, _) in pbar:
2.1 LoadImagesAndLabels()中 __getitem__ 部分
def __getitem__(self, index):
    """Load one sample and return ``(image_tensor, labels_out, image_path, shapes)``.

    ``labels_out`` has one row per label and six columns; columns 1-5 hold the
    class and the normalized xywh box, column 0 is left at zero here.
    ``shapes`` is None for mosaic samples, otherwise
    ``((h0, w0), ((h / h0, w / w0), pad))``.
    """
    index = self.indices[index]  # linear, shuffled, or image_weights
    cfg = self.hyp

    # Decide whether this sample is built as a mosaic.
    use_mosaic = self.mosaic and random.random() < cfg['mosaic']

    if not use_mosaic:
        # Load image. load_image returns the image together with its original
        # (h0, w0) and resized (h, w) sizes.
        image, (h0, w0), (h, w) = load_image(self, index)

        # Letterbox: rectangular training uses the shape prepared for this image's
        # batch, otherwise the plain img_size is the target.
        if self.rect:
            target_shape = self.batch_shapes[self.batch[index]]  # final letterboxed shape
        else:
            target_shape = self.img_size
        image, ratio, pad = letterbox(image, target_shape, auto=False, scaleup=self.augment)
        shapes = (h0, w0), ((h / h0, w / w0), pad)  # for COCO mAP rescaling

        targets = self.labels[index].copy()
        if targets.size:  # normalized xywh to pixel xyxy format
            targets[:, 1:] = xywhn2xyxy(targets[:, 1:], ratio[0] * w, ratio[1] * h, padw=pad[0], padh=pad[1])
    else:
        # Load mosaic
        image, targets = load_mosaic(self, index)
        shapes = None

        # MixUp https://arxiv.org/pdf/1710.09412.pdf
        if random.random() < cfg['mixup']:
            other_image, other_targets = load_mosaic(self, random.randint(0, self.n - 1))
            blend = np.random.beta(32.0, 32.0)  # mixup ratio, alpha=beta=32.0
            image = (image * blend + other_image * (1 - blend)).astype(np.uint8)
            targets = np.concatenate((targets, other_targets), 0)

    if self.augment:
        # Augment imagespace (skipped for mosaic samples)
        if not use_mosaic:
            image, targets = random_perspective(image, targets,
                                                degrees=cfg['degrees'],
                                                translate=cfg['translate'],
                                                scale=cfg['scale'],
                                                shear=cfg['shear'],
                                                perspective=cfg['perspective'])

        # Augment colorspace
        augment_hsv(image, hgain=cfg['hsv_h'], sgain=cfg['hsv_s'], vgain=cfg['hsv_v'])

        # Apply cutouts
        # if random.random() < 0.9:
        #     labels = cutout(img, labels)

    n_labels = len(targets)  # number of labels
    if n_labels:
        targets[:, 1:5] = xyxy2xywh(targets[:, 1:5])  # convert xyxy to xywh
        targets[:, [2, 4]] /= image.shape[0]  # normalized height 0-1
        targets[:, [1, 3]] /= image.shape[1]  # normalized width 0-1

    if self.augment:
        # flip up-down
        if random.random() < cfg['flipud']:
            image = np.flipud(image)
            if n_labels:
                targets[:, 2] = 1 - targets[:, 2]

        # flip left-right
        if random.random() < cfg['fliplr']:
            image = np.fliplr(image)
            if n_labels:
                targets[:, 1] = 1 - targets[:, 1]

    labels_out = torch.zeros((n_labels, 6))
    if n_labels:
        labels_out[:, 1:] = torch.from_numpy(targets)

    # Convert: reverse the channel axis (BGR to RGB) and move channels first.
    # The reversed, transposed array is copied into a contiguous buffer before it
    # is handed to torch.from_numpy.
    image = np.ascontiguousarray(image[:, :, ::-1].transpose(2, 0, 1))

    return torch.from_numpy(image), labels_out, self.img_files[index], shapes