First, the choice of dataset: cifar10, imagenet, or svhn.
The network: resnet20q.
1. Pick the best GPU and configure the log file
2. Set up the training and validation sets
3. Set the quantization bit widths and build the model
4. Configure the learning rate, decaying it at epochs [100, 150, 180], and set up the "SGD" optimizer
5. Resolve the path to the latest checkpoint and load its weights
6. Count the model parameters and record the number in the log
7. Cross-entropy loss; soft-target loss (a sketch of a possible CrossEntropyLossSoft follows the full code listing)
8. Start the training loop that updates the weights
    1. Forward pass
        arguments: data loader; model; cross-entropy loss; soft cross-entropy loss; epoch; training=True; optimizer; summary writer
        forward(train_loader, model, criterion, criterion_soft, epoch, True, optimizer, sum_writer)
        1. Quantization bit-width list
        2. Build the losses, top1, top5 meter lists, e.g. (loss_1, loss_2, loss_4, loss_8, loss_32), one AverageMeter per bit width (minimal sketches of AverageMeter and accuracy appear just before the code listing)
        3. Iterate over the dataset (covering both the training and the evaluation case)
            for i, (input, target) in enumerate(data_loader):
            1. If not training
                1. Disable gradient tracking
                2. input = input.cuda(); target = target.cuda()
                3. Apply each quantization bit width (1, 2, 4, 8, 32 bits) to the model in turn (a schematic of a bit-width-switchable layer follows the code listing)
                    1. Compute the model output: output = model(input)
                    2. Compute the loss: loss = criterion(output, target)
                    3. Update the loss, top1, top5 meters
            2. If training
                1. input = input.cuda(); target = target.cuda()
                2. Zero the optimizer's gradients
                3. Train the full-precision network first
                4. Compute the output and loss, backpropagate, and compute top1, top5
                5. Update the loss, top1, top5 meters
                6. Train the low-bit networks: build the soft target target_soft from the full-precision output
                7. Apply the lower bit widths (8, 4, 2, 1 bits, in descending order; the 32-bit pass was already trained in step 3) one after another
                    1. Compute the model output: output = model(input)
                    2. Compute the loss: loss = criterion_soft(output, target_soft)
                    3. Backpropagate: loss.backward()
                    4. Recursive supervision: pass output through softmax to obtain the next target_soft
                    5. Compute top1, top5
                    6. Update the loss, top1, top5 meters
            3. optimizer.step() applies all accumulated gradients in one update
            4. When the print condition is met: if i % args.print_freq == 0
                1. Print top1, top5, and loss at the maximum bit width (32-bit)
        4. Return loss, top1, top5 for every bit width (1, 2, 4, 8, 32 bits) (training mode)
    5. Obtain the training set's per-epoch loss, top1, top5
    6. Run the forward pass on the validation set; obtain its per-epoch loss, top1, top5
    7. Adjust the learning rate (a MultiStepLR-based sketch of get_lr_scheduler follows the code listing)
    8. Save the model whenever its validation accuracy improves
    9. For every bit width (1, 2, 4, 8, 32 bits), log train_loss, train_prec1, train_prec5, val_loss, val_prec1, val_prec5 with sum_writer
    10. After each epoch, log the full-precision train_loss, train_prec1, train_prec5, val_loss, val_prec1, val_prec5
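
Before the full listing, two bookkeeping helpers the script imports from utils deserve a quick look, since the file itself only shows the imports. First, AverageMeter: a minimal sketch, assuming the standard PyTorch-examples implementation and going only by how the script calls it (.update(val, n), then reads .val and .avg).

```python
# Sketch of utils.AverageMeter (assumption: the common PyTorch-examples
# meter; only update()/.val/.avg are used by the training script).
class AverageMeter(object):
    """Tracks the latest value and a sample-weighted running average."""
    def __init__(self):
        self.val = 0.0    # most recent value passed to update()
        self.sum = 0.0    # weighted sum of all values so far
        self.count = 0    # total weight (here: number of samples)
        self.avg = 0.0    # running average, sum / count

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count
```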
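Second, accuracy, which returns precision@1 and precision@5 as percentages. Again a sketch, assuming the usual top-k helper from torchvision's ImageNet example rather than the project's exact source.

```python
import torch

# Sketch of utils.accuracy: top-k precision in percent (assumed to
# mirror the torchvision ImageNet-example helper).
def accuracy(output, target, topk=(1,)):
    maxk = max(topk)
    batch_size = target.size(0)
    # indices of the k largest logits per sample: (batch, maxk) -> (maxk, batch)
    _, pred = output.topk(maxk, dim=1, largest=True, sorted=True)
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].reshape(-1).float().sum(0)
        res.append(correct_k.mul_(100.0 / batch_size))
    return res
```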
```python
import argparse
import os
import time
import socket
import logging
from datetime import datetime
from functools import partial

import torch
import torch.nn as nn
import torch.optim
import torch.utils.data
from torch.autograd import Variable
from tensorboardX import SummaryWriter

import models
from models.losses import CrossEntropyLossSoft
from datasets.data import get_dataset, get_transform
from optimizer import get_optimizer_config, get_lr_scheduler
from utils import setup_logging, setup_gpus, save_checkpoint
from utils import AverageMeter, accuracy

parser = argparse.ArgumentParser(description='Training')
parser.add_argument('--results-dir', default='./results', help='results dir')
parser.add_argument('--dataset', default='cifar10', help='dataset name or folder')
parser.add_argument('--train_split', default='train', help='train split name')
parser.add_argument('--model', default='resnet20q', help='model architecture')
parser.add_argument('--workers', default=0, type=int, help='number of data loading workers')
parser.add_argument('--epochs', default=200, type=int, help='number of epochs')
parser.add_argument('--start-epoch', default=0, type=int, help='manual epoch number')
parser.add_argument('--batch-size', default=128, type=int, help='mini-batch size')
parser.add_argument('--optimizer', default='sgd', help='optimizer function used')
parser.add_argument('--lr', default=0.1, type=float, help='initial learning rate')
parser.add_argument('--lr_decay', default='100,150,180', help='lr decay steps')
parser.add_argument('--weight-decay', default=3e-4, type=float, help='weight decay')
parser.add_argument('--print-freq', '-p', default=20, type=int, help='print frequency')
parser.add_argument('--pretrain', default=None, help='path to pretrained full-precision checkpoint')
parser.add_argument('--resume', default=None, help='path to latest checkpoint')
parser.add_argument('--bit_width_list', default="1,2,4,8,32", help='bit width list')
args = parser.parse_args()


def main():
    # 1. pick the best GPU and configure logging
    hostname = socket.gethostname()  # log file name embeds the host name
    setup_logging(os.path.join(args.results_dir, 'log_1{}.txt'.format(hostname)))
    logging.info("running arguments: %s", args)

    best_gpu = setup_gpus()  # index of the best available GPU
    torch.cuda.set_device(best_gpu)
    torch.backends.cudnn.benchmark = True  # speeds up convolutions for fixed input shapes

    # 2. training and validation sets
    train_transform = get_transform(args.dataset, 'train')
    train_data = get_dataset(args.dataset, args.train_split, train_transform)
    train_loader = torch.utils.data.DataLoader(train_data,
                                               batch_size=args.batch_size,
                                               shuffle=True,
                                               num_workers=args.workers,
                                               pin_memory=True)
    val_transform = get_transform(args.dataset, 'val')
    val_data = get_dataset(args.dataset, 'val', val_transform)
    val_loader = torch.utils.data.DataLoader(val_data,
                                             batch_size=args.batch_size,
                                             shuffle=False,
                                             num_workers=args.workers,
                                             pin_memory=True)

    # 3. set the quantization bit widths and build the model
    bit_width_list = list(map(int, args.bit_width_list.split(',')))  # [1, 2, 4, 8, 32]
    bit_width_list.sort()  # ascending, so the last entry is full precision
    model = models.__dict__[args.model](bit_width_list, train_data.num_classes).cuda()

    # 4. learning-rate decay milestones [100, 150, 180] and the SGD optimizer
    lr_decay = list(map(int, args.lr_decay.split(',')))
    optimizer = get_optimizer_config(model, args.optimizer, args.lr, args.weight_decay)
    lr_scheduler = None
    best_prec1 = None

    # 5. resolve the latest checkpoint path and load its weights
    if args.resume and args.resume != 'None':
        if os.path.isdir(args.resume):
            args.resume = os.path.join(args.resume, 'model_best.pth.tar')
        if os.path.isfile(args.resume):
            checkpoint = torch.load(args.resume, map_location='cuda:{}'.format(best_gpu))
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay, checkpoint['epoch'])
            logging.info("loaded resume checkpoint '%s' (epoch %s)", args.resume, checkpoint['epoch'])
        else:
            raise ValueError('Pretrained model path error!')
    # otherwise, load a full-precision pretrained checkpoint if a valid path was given
    elif args.pretrain and args.pretrain != 'None':
        if os.path.isdir(args.pretrain):
            args.pretrain = os.path.join(args.pretrain, 'model_best.pth.tar')
        if os.path.isfile(args.pretrain):
            checkpoint = torch.load(args.pretrain, map_location='cuda:{}'.format(best_gpu))
            model.load_state_dict(checkpoint['state_dict'], strict=False)
            logging.info("loaded pretrain checkpoint '%s' (epoch %s)", args.pretrain, checkpoint['epoch'])
        else:
            raise ValueError('Pretrained model path error!')
    if lr_scheduler is None:
        # lr_decay holds the decay milestones [100, 150, 180]
        lr_scheduler = get_lr_scheduler(args.optimizer, optimizer, lr_decay)

    # 6. count the parameters and record the number in the log
    num_parameters = sum([l.nelement() for l in model.parameters()])
    logging.info("number of parameters: %d", num_parameters)

    # 7. hard cross-entropy loss and soft-target cross-entropy loss
    criterion = nn.CrossEntropyLoss().cuda()
    criterion_soft = CrossEntropyLossSoft().cuda()

    sum_writer = SummaryWriter(args.results_dir + '/summary')

    # 8. training loop
    for epoch in range(args.start_epoch, args.epochs):
        model.train()
        train_loss, train_prec1, train_prec5 = forward(train_loader, model, criterion, criterion_soft, epoch,
                                                       True, optimizer, sum_writer)

        # 6. validation forward pass: per-epoch val_loss, val_prec1, val_prec5
        model.eval()
        val_loss, val_prec1, val_prec5 = forward(val_loader, model, criterion, criterion_soft, epoch, False)

        # 7. learning-rate adjustment
        if isinstance(lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            lr_scheduler.step(val_loss)
        else:
            lr_scheduler.step()

        # 8. save the model whenever validation accuracy improves
        if best_prec1 is None:
            is_best = True
            best_prec1 = val_prec1[-1]
        else:
            is_best = val_prec1[-1] > best_prec1
            best_prec1 = max(val_prec1[-1], best_prec1)
        save_checkpoint(
            {
                'epoch': epoch + 1,
                'model': args.model,
                'state_dict': model.state_dict(),
                'best_prec1': best_prec1,
                'optimizer': optimizer.state_dict()
            },
            is_best,
            path=args.results_dir + '/ckpt')

        # 9. log per-bit-width statistics to TensorBoard
        if sum_writer is not None:
            sum_writer.add_scalar('lr', optimizer.param_groups[0]['lr'], global_step=epoch)
            for bw, tl, tp1, tp5, vl, vp1, vp5 in zip(bit_width_list, train_loss, train_prec1, train_prec5,
                                                      val_loss, val_prec1, val_prec5):
                sum_writer.add_scalar('train_loss_{}'.format(bw), tl, global_step=epoch)
                sum_writer.add_scalar('train_prec_1_{}'.format(bw), tp1, global_step=epoch)
                sum_writer.add_scalar('train_prec_5_{}'.format(bw), tp5, global_step=epoch)
                sum_writer.add_scalar('val_loss_{}'.format(bw), vl, global_step=epoch)
                sum_writer.add_scalar('val_prec_1_{}'.format(bw), vp1, global_step=epoch)
                sum_writer.add_scalar('val_prec_5_{}'.format(bw), vp5, global_step=epoch)

        # 10. after each epoch, log the full-precision (last-entry) statistics
        logging.info('Epoch {}: \ntrain loss {:.2f}, train prec1 {:.2f}, train prec5 {:.2f}\n'
                     '  val loss {:.2f},   val prec1 {:.2f},   val prec5 {:.2f}'.format(
                         epoch, train_loss[-1], train_prec1[-1], train_prec5[-1],
                         val_loss[-1], val_prec1[-1], val_prec5[-1]))


def forward(data_loader, model, criterion, criterion_soft, epoch, training=True, optimizer=None, sum_writer=None):
    # 1. quantization bit-width list
    bit_width_list = list(map(int, args.bit_width_list.split(',')))
    bit_width_list.sort()
    # 2. one AverageMeter per bit width for losses, top1, top5
    losses = [AverageMeter() for _ in bit_width_list]
    top1 = [AverageMeter() for _ in bit_width_list]
    top5 = [AverageMeter() for _ in bit_width_list]
    # 3. iterate over the dataset (training or evaluation)
    for i, (input, target) in enumerate(data_loader):
        if not training:
            # 1. disable gradient tracking
            with torch.no_grad():
                input = input.cuda()
                target = target.cuda(non_blocking=True)
                # 3. apply each quantization bit width to the model in turn
                for bw, am_l, am_t1, am_t5 in zip(bit_width_list, losses, top1, top5):
                    model.apply(lambda m: setattr(m, 'wbit', bw))
                    model.apply(lambda m: setattr(m, 'abit', bw))
                    output = model(input)
                    loss = criterion(output, target)
                    prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
                    am_l.update(loss.item(), input.size(0))
                    am_t1.update(prec1.item(), input.size(0))
                    am_t5.update(prec5.item(), input.size(0))
        else:
            input = input.cuda()
            target = target.cuda(non_blocking=True)
            # 2. reset the accumulated gradients
            optimizer.zero_grad()
            # 3. train the full-precision network first
            model.apply(lambda m: setattr(m, 'wbit', bit_width_list[-1]))  # weight bit width
            model.apply(lambda m: setattr(m, 'abit', bit_width_list[-1]))  # activation bit width
            # 4. compute the output and loss, then backpropagate
            output = model(input)
            loss = criterion(output, target)
            loss.backward()
            prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
            # 5. update the loss, top1, top5 meters
            losses[-1].update(loss.item(), input.size(0))
            top1[-1].update(prec1.item(), input.size(0))
            top5[-1].update(prec5.item(), input.size(0))
            # 6. train the low-bit networks: the full-precision output becomes the soft target
            target_soft = torch.nn.functional.softmax(output.detach(), dim=1)
            # 7. apply the lower bit widths in descending order
            for bw, am_l, am_t1, am_t5 in zip(bit_width_list[:-1][::-1], losses[:-1][::-1],
                                              top1[:-1][::-1], top5[:-1][::-1]):
                model.apply(lambda m: setattr(m, 'wbit', bw))  # switch the weight bit width
                model.apply(lambda m: setattr(m, 'abit', bw))  # switch the activation bit width
                # 1. compute the model output at the current bit width
                output = model(input)
                # hard cross entropy (unused):
                # loss = criterion(output, target)
                # 2. soft cross entropy against the soft target
                loss = criterion_soft(output, target_soft)
                # 3. backpropagate; gradients accumulate across bit widths
                loss.backward()
                # 4. recursive supervision: the current output, through softmax, becomes the next soft target
                target_soft = torch.nn.functional.softmax(output.detach(), dim=1)
                # 5./6. compute and record accuracy and loss
                prec1, prec5 = accuracy(output.data, target, topk=(1, 5))
                am_l.update(loss.item(), input.size(0))
                am_t1.update(prec1.item(), input.size(0))
                am_t5.update(prec5.item(), input.size(0))
            # 3. a single optimizer.step() updates all parameters
            optimizer.step()
            # 4. print at the configured frequency
            if i % args.print_freq == 0:
                logging.info('epoch {0}, iter {1}/{2}, bit_width_max loss {3:.2f}, prec1 {4:.2f}, prec5 {5:.2f}'.format(
                    epoch, i, len(data_loader), losses[-1].val, top1[-1].val, top5[-1].val))
    # 4. return loss, top1, top5 for every bit width (1, 2, 4, 8, 32)
    return [_.avg for _ in losses], [_.avg for _ in top1], [_.avg for _ in top5]


if __name__ == '__main__':
    main()
```
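
CrossEntropyLossSoft is imported from models.losses but not shown above. The script passes it logits plus a (batch, classes) probability distribution, so a minimal sketch consistent with that call signature (an assumption, not the project's verbatim source) is:

```python
import torch.nn as nn
import torch.nn.functional as F

# Sketch of models.losses.CrossEntropyLossSoft: cross entropy against a
# soft (probability-distribution) target instead of hard class indices.
class CrossEntropyLossSoft(nn.Module):
    def forward(self, output, target_soft):
        # H(p, q) = -sum_c p_c * log q_c, averaged over the batch
        log_prob = F.log_softmax(output, dim=1)
        return -(target_soft * log_prob).sum(dim=1).mean()
```

With a one-hot target this reduces to ordinary cross entropy; here target_soft carries the softened predictions of the next-higher bit width, so each low-bit network is distilled from the one above it.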
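get_lr_scheduler and get_optimizer_config live in the external optimizer module. For the default SGD settings, a plausible stand-in built on torch.optim.lr_scheduler.MultiStepLR (the decay factor gamma=0.1 is an assumption; the project may use a different value) would be:

```python
import torch

# Hypothetical stand-in for optimizer.get_lr_scheduler: step the learning
# rate down by an assumed factor of 0.1 at the milestone epochs
# [100, 150, 180]; last_epoch lets a resumed run rejoin the schedule.
def get_lr_scheduler(optimizer_name, optimizer, lr_decay, last_epoch=-1):
    return torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                milestones=lr_decay,
                                                gamma=0.1,
                                                last_epoch=last_epoch)
```

Note that resuming with last_epoch != -1 requires an 'initial_lr' key in each optimizer param group, which PyTorch fills in on the scheduler's first construction; the real helper presumably handles this when restoring from a checkpoint.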
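Finally, the bit-width switch itself: model.apply(fn) calls fn on every submodule, so the lambdas in forward() simply overwrite a wbit/abit attribute on each layer, and the quantized layers read those attributes on their next forward pass. The schematic below illustrates that protocol with a uniform quantizer and a straight-through estimator; the real resnet20q layers (per-bit BatchNorm, learned clipping, and so on) are more involved, so treat every name here as illustrative.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

def quantize_ste(x, bits):
    """Uniform quantization of values in [0, 1] to 2**bits levels, with a
    straight-through estimator so round() does not block gradients."""
    if bits >= 32:
        return x  # full precision passes through unchanged
    scale = 2 ** bits - 1
    xq = torch.round(x.clamp(0, 1) * scale) / scale
    return x + (xq - x).detach()  # forward: xq, backward: identity

# Illustrative switchable-precision convolution: wbit/abit are plain
# attributes, overwritten from outside via model.apply(...).
class QConv2d(nn.Conv2d):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.wbit = 32
        self.abit = 32

    def forward(self, x):
        # map weights into [0, 1], quantize, then convolve with the
        # quantized weights and (assumed post-ReLU) activations
        w01 = torch.tanh(self.weight) * 0.5 + 0.5
        w = quantize_ste(w01, self.wbit)
        x = quantize_ste(x, self.abit)
        return F.conv2d(x, w, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)
```

Because every bit width shares the same underlying weight tensor, each loss.backward() in the training branch accumulates gradients into that tensor, and the single optimizer.step() at the end of the batch applies the joint update; this is why optimizer.zero_grad() is called only once per batch.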