import math

import torch
import torch.nn as nn


# PFF (position-wise feed-forward) layer, essentially two fully connected layers;
# it sits after the attention layer in every Transformer block
class PositionwiseFeedForward(nn.Module):
    "Implements FFN equation."

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        # LL1, weight matrix of size ES * FF
        self.w_1 = nn.Linear(d_model, d_ff)
        # LL2, weight matrix of size FF * ES
        self.w_2 = nn.Linear(d_ff, d_model)
        # Dropout
        self.dropout = nn.Dropout(dropout)
        # the activation function is GELU
        self.activation = GELU()

    def forward(self, x):
        # input -> LL1 -> GELU -> Dropout -> LL2 -> output
        return self.w_2(self.dropout(self.activation(self.w_1(x))))


# handles the residual connections inside a Transformer block
class SublayerConnection(nn.Module):
    """
    A residual connection followed by a layer norm.
    Note for code simplicity the norm is first as opposed to last.
    """

    def __init__(self, size, dropout):
        super(SublayerConnection, self).__init__()
        # layer normalization
        self.norm = LayerNorm(size)
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, sublayer):
        # input -> LN -> sublayer -> Dropout -> residual add -> output
        #   |                                       ⬆
        #   +---------------------------------------+
        return x + self.dropout(sublayer(self.norm(x)))


# GELU is a smooth, Gaussian-based approximation of ReLU
class GELU(nn.Module):
    """
    Paper Section 3.4, last paragraph: note that BERT uses GELU instead of RELU
    """

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


# layer normalization (see the citation for details)
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        # scale (gain) parameter
        self.a_2 = nn.Parameter(torch.ones(features))
        # shift (bias) parameter
        self.b_2 = nn.Parameter(torch.zeros(features))
        # small epsilon to avoid division by zero
        self.eps = eps

    def forward(self, x):
        # mean and standard deviation are computed over the last dimension,
        # i.e. over each embedding vector
        # `keepdim=True` keeps the number of dimensions unchanged:
        # the input is BS * ML * ES, the statistics are BS * ML * 1
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        # normalize the last dimension, then apply the scale and the shift
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2


# The Transformer block is the basic building block of any Transformer
# architecture, not just BERT; models differ only in the number of layers,
# number of heads, embedding size, vocabulary, training data, and the
# decoder (i.e. the task-specific head).
class TransformerBlock(nn.Module):
    """
    Bidirectional Encoder = Transformer (self-attention)
    Transformer = MultiHead_Attention + Feed_Forward with sublayer connection
    """

    def __init__(self, hidden, attn_heads, feed_forward_hidden, dropout):
        """
        :param hidden: hidden size of transformer
        :param attn_heads: number of heads in multi-head attention
        :param feed_forward_hidden: feed-forward hidden size, usually 4 * hidden
        :param dropout: dropout rate
        """
        super().__init__()
        # part 1: the attention layer (MultiHeadedAttention is defined outside this section)
        self.attention = MultiHeadedAttention(h=attn_heads, d_model=hidden)
        # part 2: the PFF layer
        self.feed_forward = PositionwiseFeedForward(d_model=hidden, d_ff=feed_forward_hidden, dropout=dropout)
        # residual wrapper around the attention layer
        self.input_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        # residual wrapper around the PFF layer
        self.output_sublayer = SublayerConnection(size=hidden, dropout=dropout)
        # final Dropout
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, mask):
        # input -> LN1 -> attention -> Dropout1 -> residual add -> ...
        #   |                                           ↑
        #   +-------------------------------------------+
        # all three attention inputs here are `x` (self-attention), but they are
        # still named Q, K and V, which is easy to confuse
        x = self.input_sublayer(x, lambda _x: self.attention.forward(_x, _x, _x, mask=mask))
        # ... -> LN2 -> FFN -> Dropout2 -> residual add -> ...
        #   |                                   ↑
        #   +-----------------------------------+
        x = self.output_sublayer(x, self.feed_forward)
        # ... -> Dropout3 -> output
        return self.dropout(x)
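

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original model code): a minimal, self-contained
# check of the sub-modules defined above. The tensor shape follows the
# BS * ML * ES convention used in the comments (batch size, max sequence
# length, embedding size); the concrete sizes below are illustrative
# assumptions, not values from the BERT paper.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    bs, ml, es = 2, 8, 16                      # assumed toy sizes
    x = torch.randn(bs, ml, es)

    # LayerNorm: with the default scale (ones) and shift (zeros), each embedding
    # vector ends up with (roughly) zero mean along the last dimension.
    ln = LayerNorm(es)
    assert torch.allclose(ln(x).mean(-1), torch.zeros(bs, ml), atol=1e-5)

    # SublayerConnection wraps the PFF layer in the pre-LN residual pattern
    # x + Dropout(FFN(LN(x))); the residual path preserves the input shape.
    ffn = PositionwiseFeedForward(d_model=es, d_ff=4 * es, dropout=0.1)
    residual = SublayerConnection(size=es, dropout=0.1)
    out = residual(x, ffn)
    assert out.shape == (bs, ml, es)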