1. RaftMLP
Structurally this is similar to ViP, HireMLP, and related models, so it does not feel particularly novel: they all rearrange and mix features along the h and w directions. The more interesting part is the proposed multi-scale patch embedding, which relies mainly on the nn.Unfold function to extract each kernel-sized region of the feature map as a single patch.
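As a quick toy check of how nn.Unfold behaves (not from the paper; the sizes here are arbitrary), the snippet below extracts non-overlapping 2×2 patches from a 4×4 feature map:

```python
import torch
import torch.nn as nn

# Toy check of nn.Unfold: it slides a k x k window over the feature map and
# flattens each window into a column of length c * k * k.
x = torch.arange(1 * 1 * 4 * 4, dtype=torch.float32).reshape(1, 1, 4, 4)
unfold = nn.Unfold(kernel_size=2, stride=2)  # non-overlapping 2x2 patches
patches = unfold(x)                          # (b, c*k*k, L) = (1, 4, 4)
print(patches.shape)
print(patches[0, :, 0])                      # first patch: tensor([0., 1., 4., 5.])
```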
RaftMLP architecture
Pseudocode of raft-token-mixing block (Pytorch-like)
```python
class RaftTokenMixingBlock(nn.Module):
    # b: size of mini-batch, h: height, w: width,
    # c: channel, r: size of raft (number of groups), o: c//r,
    # e: expansion factor,
    # x: input tensor of shape (h, w, c)
    def __init__(self):
        self.lnv = nn.LayerNorm(c)
        self.lnh = nn.LayerNorm(c)
        self.fcv1 = nn.Linear(r * h, r * h * e)
        self.fcv2 = nn.Linear(r * h * e, r * h)
        self.fch1 = nn.Linear(r * w, r * w * e)
        self.fch2 = nn.Linear(r * w * e, r * w)

    def forward(self, x):
        """ x: b, hw, c """
        # Vertical-Mixing Block
        y = self.lnv(x)
        y = rearrange(y, 'b (h w) (r o) -> b (o w) (r h)')
        y = self.fcv1(y)
        y = F.gelu(y)
        y = self.fcv2(y)
        y = rearrange(y, 'b (o w) (r h) -> b (h w) (r o)')
        y = x + y
        # Horizontal-Mixing Block
        y = self.lnh(y)
        y = rearrange(y, 'b (h w) (r o) -> b (o h) (r w)')
        y = self.fch1(y)
        y = F.gelu(y)
        y = self.fch2(y)
        y = rearrange(y, 'b (o h) (r w) -> b (h w) (r o)')
        return x + y
```
This pseudocode is already quite detailed; a runnable sketch with concrete shapes follows below.
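The following is a minimal runnable sketch of the block above, not the official RaftMLP implementation: it mirrors the pseudocode line by line and assumes h, w, c, the raft size r, and the expansion factor e are fixed at construction time (that assumption is mine).

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange


class RaftTokenMixingBlock(nn.Module):
    # Runnable sketch of the paper's pseudocode; shapes are fixed in __init__ (assumption).
    def __init__(self, h, w, c, r=2, e=2):
        super().__init__()
        assert c % r == 0, "c must be divisible by the raft size r"
        self.h, self.w, self.r, self.o = h, w, r, c // r
        self.lnv = nn.LayerNorm(c)
        self.lnh = nn.LayerNorm(c)
        self.fcv1 = nn.Linear(r * h, r * h * e)
        self.fcv2 = nn.Linear(r * h * e, r * h)
        self.fch1 = nn.Linear(r * w, r * w * e)
        self.fch2 = nn.Linear(r * w * e, r * w)

    def forward(self, x):  # x: (b, h*w, c)
        h, w, r, o = self.h, self.w, self.r, self.o
        # Vertical-Mixing Block: mix along (r h) for every (o w) slice
        y = self.lnv(x)
        y = rearrange(y, 'b (h w) (r o) -> b (o w) (r h)', h=h, w=w, r=r, o=o)
        y = self.fcv2(F.gelu(self.fcv1(y)))
        y = rearrange(y, 'b (o w) (r h) -> b (h w) (r o)', h=h, w=w, r=r, o=o)
        y = x + y
        # Horizontal-Mixing Block: mix along (r w) for every (o h) slice
        y = self.lnh(y)
        y = rearrange(y, 'b (h w) (r o) -> b (o h) (r w)', h=h, w=w, r=r, o=o)
        y = self.fch2(F.gelu(self.fch1(y)))
        y = rearrange(y, 'b (o h) (r w) -> b (h w) (r o)', h=h, w=w, r=r, o=o)
        return x + y


if __name__ == '__main__':
    block = RaftTokenMixingBlock(h=14, w=14, c=64, r=2, e=2)
    x = torch.rand(1, 14 * 14, 64)
    print(block(x).shape)  # torch.Size([1, 196, 64])
```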
- Multi-scale Patch Embedding
Below is my code for multi-scale patch embedding, written from the pseudocode given in the paper:
```python
import torch
import torch.nn as nn
from timm.models.layers import to_2tuple
from einops.layers.torch import Rearrange


# b: size of mini-batch, h: height, w: width,
# kernels: list of kernel sizes for unfold, e.g. [4, 8]
class MultiScalePatchEmbedding(nn.Module):
    def __init__(self, in_channels, out_channels, kernels):
        super().__init__()
        self.stride = 1
        # mlp_in_channels = sum(k ** 2 for k in kernels) * in_channels
        mlp_in_channels = 0
        for k in kernels:
            mlp_in_channels += k ** 2
        mlp_in_channels *= in_channels
        self.embeddings = nn.ModuleList([
            # with stride=1 and padding=k//2 (k even), every kernel size yields
            # (spatial_size + 1) positions per dim, so the patch counts match
            nn.Sequential(*[
                nn.Unfold(kernel_size=to_2tuple(k),
                          stride=self.stride,
                          padding=k // 2),  # I changed the padding here: (k - stride) // 2 -> k // 2
                Rearrange("b c hw -> b hw c")
            ]) for k in kernels
        ])
        self.fc = nn.Linear(mlp_in_channels, out_channels)

    def forward(self, input):
        b, _, h, w = input.shape
        outputs = []
        for emb in self.embeddings:
            output = emb(input)
            outputs.append(output)
        # input: [1, 48, 56, 56], kernels: [4, 8]
        # output: [1, 3249, 3840] -> 3840 = 768 (4*4*48) + 3072 (8*8*48)
        outputs = torch.cat(outputs, dim=2)
        # [1, 3249, 3840] -> [1, 3249, 96]
        outputs = self.fc(outputs)
        return outputs  # (b, patch_nums, patch_embeddings)


if __name__ == '__main__':
    model = MultiScalePatchEmbedding(48, 96, [4, 8])
    input = torch.rand([1, 48, 56, 56])
    outputs = model(input)
    print(outputs.shape)
```
For a more detailed walkthrough, see: https://blog.csdn.net/P_LarT/article/details/120694405
2. DynaMixer
DynaMixer proposes to generate the mixing matrix dynamically from the contents of all the tokens being mixed; in essence it applies an attention-like mechanism to each slice along the h and w directions, improving on the fixed-weight mixing matrices used in earlier MLP models. The mixing matrix can be generated over the whole channel dimension, or the channels can be split into groups (segments), with a mixing matrix generated for each group and the outputs concatenated (the per-segment operation is written out below).
- DynaMixer architecture
- DynaMixer operation for one segment
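Written out for one segment (my own notation, chosen to match the code below rather than copied verbatim from the paper): for the $N$ tokens $X \in \mathbb{R}^{N \times D}$ of a single row or column, the operation first compresses the channels, then generates an $N \times N$ mixing matrix from the flattened result, and finally mixes the tokens with it:

$$
\begin{aligned}
\hat{X} &= X W_1, && W_1 \in \mathbb{R}^{D \times d} \\
P &= \operatorname{softmax}\big(\operatorname{reshape}_{N \times N}(\operatorname{vec}(\hat{X})\, W_2)\big), && W_2 \in \mathbb{R}^{Nd \times N^2} \\
Y &= P X
\end{aligned}
$$

The softmax is applied row-wise, so each output token is a content-dependent weighted combination of all $N$ input tokens.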
Algorithm 1 Pseudo-code for DynaMixer Block (PyTorch-like)
```python
###### initialization ######
proj_c = nn.Linear(D, D)
proj_o = nn.Linear(D, D)

###### code in forward ######
def dyna_mixer_block(self, X):
    H, W, D = X.shape

    # row mixing
    for h = 1:H
        Y_h[h, :, :] = DynaMixerOp_h(X[h, :, :])

    # column mixing
    for w = 1:W
        Y_w[:, w, :] = DynaMixerOp_w(X[:, w, :])

    # channel mixing
    Y_c = proj_c(X)

    Y_out = Y_h + Y_w + Y_c
    return proj_o(Y_out)
```
Below is my implementation of the DynaMixer Block structure, written from the paper and the pseudocode above:
```python
import torch
import torch.nn as nn
from einops import rearrange


class DynaMixerOperation(nn.Module):
    def __init__(self, N, D, d=10):
        super().__init__()
        self.N = N
        self.D = D
        self.d = d
        self.fc_ND = nn.Linear(D, d)
        self.fc_Nd = nn.Linear(N * d, N * N)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, input):
        B, D, N = input.shape
        # Dynamic mixing matrix generation
        input = rearrange(input, 'b d n -> b n d')
        p = self.fc_ND(input)              # (b, n, d): compress channels D -> d
        p = p.reshape(-1, 1, N * self.d)   # flatten the N tokens
        p = self.fc_Nd(p)                  # (b, 1, n*n): generate the mixing matrix
        p = p.reshape(-1, N, N)
        p = self.softmax(p)                # row-wise softmax
        out = torch.matmul(p, input)       # mix the N tokens
        out = rearrange(out, 'b n d -> b d n')
        return out


class DynaMixerBlock(nn.Module):
    def __init__(self, channels=48, imagesize=(54, 58)):
        super().__init__()
        h_size = imagesize[0]
        w_size = imagesize[1]
        self.dynamixer_op_h = DynaMixerOperation(w_size, channels)
        self.dynamixer_op_w = DynaMixerOperation(h_size, channels)
        self.proj_c = nn.Conv2d(channels, channels, kernel_size=1)
        self.proj_o = nn.Conv2d(channels, channels, kernel_size=1)

    def forward(self, input):
        b, c, h, w = input.shape
        # row mixing (clone so that writing the mixed rows does not overwrite the input in place)
        Y_h = input.clone()
        for i in range(h):
            Y_h[:, :, i, :] = self.dynamixer_op_h(input[:, :, i, :])  # (b, c, w)
        # column mixing
        Y_w = input.clone()
        for i in range(w):
            Y_w[:, :, :, i] = self.dynamixer_op_w(input[:, :, :, i])  # (b, c, h)
        # channel mixing
        Y_c = self.proj_c(input)
        Y_out = Y_h + Y_w + Y_c
        return self.proj_o(Y_out)


if __name__ == '__main__':
    SEED = 42
    torch.manual_seed(SEED)
    model = DynaMixerBlock()
    input = torch.rand([1, 48, 54, 58])  # (b, c, h, w)
    out = model(input)
    print(out.shape)
```
Here I did not split the channels into groups; I wrote it directly with a single segment, i.e. channel = D. A sketch of how the grouped (multi-segment) variant could look is given below.
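The following is my own extrapolation of the grouped case, not code from the paper: the D channels are split into `segments` equal chunks, each chunk is mixed by its own DynaMixerOperation (the class defined above), and the mixed chunks are concatenated back along the channel dimension. The class name GroupedDynaMixerOperation and the `segments` argument are mine.

```python
class GroupedDynaMixerOperation(nn.Module):
    # Hypothetical grouped variant (my naming): D channels split into `segments` groups of
    # D // segments channels, one DynaMixerOperation per group, outputs concatenated.
    def __init__(self, N, D, segments=4, d=10):
        super().__init__()
        assert D % segments == 0, "D must be divisible by the number of segments"
        self.segments = segments
        self.ops = nn.ModuleList(
            [DynaMixerOperation(N, D // segments, d) for _ in range(segments)]
        )

    def forward(self, input):  # input: (b, D, N)
        chunks = torch.chunk(input, self.segments, dim=1)            # segments x (b, D//segments, N)
        mixed = [op(chunk) for op, chunk in zip(self.ops, chunks)]   # mix each segment independently
        return torch.cat(mixed, dim=1)                               # (b, D, N)


# usage sketch: one row of a 48-channel, width-58 feature map
# GroupedDynaMixerOperation(N=58, D=48, segments=4)(torch.rand(1, 48, 58)).shape -> (1, 48, 58)
```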