CAM论文:Learning Deep Features for Discriminative Localization
CAM的原理是取出全连接层中得到类别C的概率的那一维权值,用W表示。然后对GAP前的feature map进行加权求和,由于此时feature map不是原图像大小,在加权求和后还需要进行上采样,即可得到Class Activation Map。
CAM有个很致命的缺陷,它的结构是由CNN + GAP + FC + Softmax组成。也就是说如果想要可视化某个现有的模型,对于没有GAP的模型来说需要修改原模型结构,并重新训练,相当麻烦,且如果模型很大,在修改后重新训练不一定能达到原效果,可视化也就没有意义了。
Grad-CAM论文:Grad-CAM: Visual Explanations from Deep Networks via Gradient-based Localization
原理:同样是处理CNN特征提取网络的最后一层feature maps。Grad-CAM对于想要可视化的类别C,将网络最后输出的类别C的得分(softmax之前的logit)反向传播到最后一层feature maps,得到类别C对该feature maps的每个像素的梯度值;对每个通道的梯度值取全局平均池化,即可得到对该通道feature map的加权系数alpha。论文中证明,对于满足CAM结构要求的网络,这样获取的加权系数与CAM中的权重是等价的(仅相差一个比例常数)。接下来对特征图加权求和,使用ReLU进行修正,再进行上采样。
if __name__ == "__main__":
    imgs_path = "path/to/image.png"

    # Use the GPU when one is available, otherwise fall back to the CPU so
    # the script does not crash on machines without CUDA.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = models.mobilenet_v3_large(pretrained=True)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    model.load_state_dict(torch.load('model.pth', map_location=device))
    model = model.to(device).eval()

    # target_layers are the layers to visualise; here, the last feature block.
    target_layers = [model.features[-1]]

    img, data = image_proprecess(imgs_path)
    data = data.to(device)

    cam = GradCAM(model=model, target_layers=target_layers)
    # Class to visualise. None means "use the class with the highest
    # predicted score for the given input".
    target_category = None
    grayscale_cam = cam(input_tensor=data, target_category=target_category)
    # Keep the map of the first (and only) image in the batch.
    grayscale_cam = grayscale_cam[0, :]

    # `img` is a PIL image, i.e. RGB, and matplotlib also displays RGB.
    # OpenCV's colour map is BGR by default, so use_rgb=True is required to
    # avoid blending a channel-swapped heatmap into the picture.
    visualization = show_cam_on_image(
        np.array(img) / 255., grayscale_cam, use_rgb=True)

    plt.imshow(visualization)
    plt.axis('off')
    plt.savefig("path/to/gradcam_image.jpg")
def image_proprecess(img_path):
    """Load an image and prepare it for the network and for display.

    :param img_path: Path of the image file to open.
    :returns: A tuple ``(img_resize, data)`` where ``img_resize`` is the
        384x384 RGB PIL image used as the visualisation background and
        ``data`` is the normalised tensor with a leading batch dimension
        of size 1.
    """
    # Force three RGB channels: Normalize below is given 3-channel statistics,
    # so RGBA, palette or grayscale files would otherwise fail (or be
    # normalised with the wrong channel count). The context manager closes
    # the underlying file as soon as the pixel data has been copied.
    with Image.open(img_path) as source:
        img = source.convert("RGB")

    data_transforms = transforms.Compose([
        transforms.Resize((384, 384), interpolation=3),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ])
    data = data_transforms(img)
    # Add the batch dimension expected by the model.
    data = torch.unsqueeze(data, 0)

    img_resize = img.resize((384, 384))
    return img_resize, data
class GradCAM:
    """Compute Grad-CAM saliency maps for one or more target layers.

    The instance hooks the target layers through ``ActivationsAndGradients``,
    runs a forward and a backward pass for the requested class, and turns
    the captured activations and gradients into one map per input image.
    It can be used as a context manager so the hooks are released on exit.

    :param model: The network to explain; it is switched to eval mode.
    :param target_layers: Layers whose activations/gradients are captured.
    :param reshape_transform: Optional callable applied to captured
        activations and gradients before they are stored.
    :param use_cuda: When True, the model and every input tensor are moved
        to the GPU. Defaults to False, which leaves device placement to the
        caller.
    """

    def __init__(self, model, target_layers, reshape_transform=None,
                 use_cuda=False):
        self.model = model.eval()
        self.target_layers = target_layers
        self.reshape_transform = reshape_transform
        # `use_cuda` is an explicit optional argument so that construction
        # never depends on a module-level global of the same name.
        self.cuda = use_cuda
        if self.cuda:
            self.model = self.model.cuda()
        self.activations_and_grads = ActivationsAndGradients(
            self.model, target_layers, reshape_transform)

    @staticmethod
    def get_cam_weights(grads):
        """Get a vector of weights for every channel in the target layer.

        Methods that return channel weights will typically only need to
        implement this function. Here the weights are the mean of the
        gradients over axes 2 and 3, with those axes kept as size 1.
        """
        return np.mean(grads, axis=(2, 3), keepdims=True)

    @staticmethod
    def get_loss(output, target_category):
        """Sum ``output[i, target_category[i]]`` over the batch."""
        return sum(output[i, category]
                   for i, category in enumerate(target_category))

    def get_cam_image(self, activations, grads):
        """Weight the activations per channel and sum over the channel axis."""
        weights = self.get_cam_weights(grads)
        weighted_activations = weights * activations
        return weighted_activations.sum(axis=1)

    @staticmethod
    def get_target_width_height(input_tensor):
        """Return ``(width, height)`` taken from the last two tensor dims."""
        width, height = input_tensor.size(-1), input_tensor.size(-2)
        return width, height

    def compute_cam_per_layer(self, input_tensor):
        """Build one scaled, input-sized map per target layer.

        :returns: A list with one array per layer; each array has an extra
            axis inserted at position 1 so the layers can be concatenated.
        """
        activations_list = [a.cpu().data.numpy()
                            for a in self.activations_and_grads.activations]
        grads_list = [g.cpu().data.numpy()
                      for g in self.activations_and_grads.gradients]
        target_size = self.get_target_width_height(input_tensor)

        cam_per_target_layer = []
        # Loop over the saliency image from every layer.
        for layer_activations, layer_grads in zip(activations_list, grads_list):
            cam = self.get_cam_image(layer_activations, layer_grads)
            # ReLU: keep only locations with a positive contribution.
            cam[cam < 0] = 0
            scaled = self.scale_cam_image(cam, target_size)
            cam_per_target_layer.append(scaled[:, None, :])

        return cam_per_target_layer

    def aggregate_multi_layers(self, cam_per_target_layer):
        """Average the per-layer maps and rescale the result."""
        cam_per_target_layer = np.concatenate(cam_per_target_layer, axis=1)
        cam_per_target_layer = np.maximum(cam_per_target_layer, 0)
        result = np.mean(cam_per_target_layer, axis=1)
        return self.scale_cam_image(result)

    @staticmethod
    def scale_cam_image(cam, target_size=None):
        """Min-max scale every map and optionally resize it.

        :param cam: Iterable of 2-D maps (one per image).
        :param target_size: Optional ``(width, height)`` passed to
            ``cv2.resize``; when None the maps keep their size.
        :returns: The processed maps stacked into a float32 array.
        """
        result = []
        for img in cam:
            img = img - np.min(img)
            # The small epsilon avoids a division by zero for constant maps.
            img = img / (1e-7 + np.max(img))
            if target_size is not None:
                img = cv2.resize(img, target_size)
            result.append(img)
        return np.float32(result)

    def __call__(self, input_tensor, target_category=None):
        """Compute the map for ``input_tensor``.

        :param input_tensor: Batch fed to the model.
        :param target_category: An int (used for every image), a sequence
            with one class index per image, or None to use the arg-max of
            the model output for each image.
        :returns: The aggregated maps, one per image in the batch.
        """
        if self.cuda:
            input_tensor = input_tensor.cuda()

        # Forward pass; the model output is used as-is (logits, no softmax
        # is applied here).
        output = self.activations_and_grads(input_tensor)
        if isinstance(target_category, int):
            target_category = [target_category] * input_tensor.size(0)

        if target_category is None:
            target_category = np.argmax(output.cpu().data.numpy(), axis=-1)
            print(f"category id: {target_category}")
        else:
            assert (len(target_category) == input_tensor.size(0)), \
                "target_category must hold one class index per input image"

        self.model.zero_grad()
        loss = self.get_loss(output, target_category)
        loss.backward(retain_graph=True)

        # In most of the saliency attribution papers, the saliency is
        # computed with a single target layer.
        # Commonly it is the last convolutional layer.
        # Here we support passing a list with multiple target layers.
        # It will compute the saliency image for every image,
        # and then aggregate them (with a default mean aggregation).
        # This gives you more flexibility in case you just want to
        # use all conv layers for example, all Batchnorm layers,
        # or something else.
        cam_per_layer = self.compute_cam_per_layer(input_tensor)
        return self.aggregate_multi_layers(cam_per_layer)

    def __del__(self):
        # The attribute is missing when __init__ raised before creating it;
        # guard so object finalisation never raises AttributeError.
        hooks = getattr(self, "activations_and_grads", None)
        if hooks is not None:
            hooks.release()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        self.activations_and_grads.release()
        if isinstance(exc_value, IndexError):
            # Handle IndexError here...
            print(
                f"An exception occurred in CAM with block: {exc_type}. Message: {exc_value}")
            return True
简要说明一下整体在做什么:先通过下方的ActivationsAndGradients获取模型推理过程中目标层的激活值(即feature maps)和对应的梯度,计算要可视化的类的loss(其它类的都忽略),通过这个loss反向传播得到可视化类对应的梯度图,将其进行全局平均池化获得每个feature maps通道的加权系数,与feature maps进行通道上加权,并在通道维度上求和获得单通道图,再经过ReLU即输出对应的图。注:此图还不是热力图,还需要与原图相加才能获得最终的热力图。
# Class to visualise. None means "use the class with the highest predicted
# score for the given input".
target_category = None

# Build the Grad-CAM helper for the chosen layers and run it on the input.
cam = GradCAM(model=model, target_layers=target_layers)
grayscale_cam = cam(input_tensor=data, target_category=target_category)
class ActivationsAndGradients:
    """Capture activations and gradients of selected intermediate layers.

    A forward hook and a backward hook are registered on every target
    layer. Each captured tensor is detached and moved to the CPU before it
    is stored.
    """

    def __init__(self, model, target_layers, reshape_transform):
        self.model = model
        self.gradients = []
        self.activations = []
        self.reshape_transform = reshape_transform
        self.handles = []

        for layer in target_layers:
            forward_handle = layer.register_forward_hook(self.save_activation)
            self.handles.append(forward_handle)

            # Backward compatibility with older pytorch versions: prefer the
            # full backward hook and fall back to the legacy one.
            if hasattr(layer, 'register_full_backward_hook'):
                register_backward = layer.register_full_backward_hook
            else:
                register_backward = layer.register_backward_hook
            self.handles.append(register_backward(self.save_gradient))

    def save_activation(self, module, input, output):
        """Forward hook: store the (optionally reshaped) layer output."""
        captured = output
        if self.reshape_transform is not None:
            captured = self.reshape_transform(captured)
        self.activations.append(captured.cpu().detach())

    def save_gradient(self, module, grad_input, grad_output):
        """Backward hook: store the gradient w.r.t. the layer output."""
        captured = grad_output[0]
        if self.reshape_transform is not None:
            captured = self.reshape_transform(captured)
        # Gradients are computed in reverse order, so prepend to keep the
        # list aligned with the activations.
        self.gradients = [captured.cpu().detach(), *self.gradients]

    def __call__(self, x):
        """Reset the stored tensors and run the model on ``x``."""
        self.gradients = []
        self.activations = []
        return self.model(x)

    def release(self):
        """Remove every hook registered by this object."""
        for hook_handle in self.handles:
            hook_handle.remove()
def show_cam_on_image(img: np.ndarray,
                      mask: np.ndarray,
                      use_rgb: bool = False,
                      colormap: int = cv2.COLORMAP_JET) -> np.ndarray:
    """Blend a colour-mapped CAM mask with an image.

    The mask is turned into a heatmap with OpenCV, which is BGR unless
    ``use_rgb`` is set.

    :param img: The base image in RGB or BGR format, with values in [0, 1].
    :param mask: The cam mask.
    :param use_rgb: Whether to use an RGB or BGR heatmap; set it to True
        when 'img' is in RGB format.
    :param colormap: The OpenCV colormap to be used.
    :returns: The image with the cam overlay as a uint8 array.
    :raises Exception: If the maximum of ``img`` is greater than 1.
    """
    mask_u8 = np.uint8(255 * mask)
    colored = cv2.applyColorMap(mask_u8, colormap)
    if use_rgb:
        colored = cv2.cvtColor(colored, cv2.COLOR_BGR2RGB)
    colored = np.float32(colored) / 255

    if np.max(img) > 1:
        raise Exception(
            "The input image should np.float32 in the range [0, 1]")

    # Add heatmap and image, then renormalise so the peak value is 1.
    overlay = colored + img
    overlay = overlay / np.max(overlay)
    return np.uint8(255 * overlay)
