[Transformer-XL] Paper implementation: Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context

When I have time I will reimplement this in TensorFlow 2.0; for now, the model code from the original TensorFlow 1.x release is as follows:

# The full code is available here
import tensorflow as tf
def positional_embedding(pos_seq, inv_freq, bsz=None):
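  # pos_seq holds relative distances [klen - 1, ..., 0] and inv_freq the sinusoid
  # frequencies 1 / 10000^(2i / d_model); their outer product gives one phase per
  # (position, frequency) pair, and sin/cos are concatenated along the last axis.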
  sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
  pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
  if bsz is not None:
    return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
  else:
    return pos_emb[:, None, :]
def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer,
                    scope='ff', is_training=True):
  output = inp
  with tf.variable_scope(scope):
    output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu,
                             kernel_initializer=kernel_initializer,
                             name='layer_1')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_1')
    output = tf.layers.dense(output, d_model,
                             kernel_initializer=kernel_initializer,
                             name='layer_2')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_2')
    output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1)
  return output
def rel_shift(x):
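  # Relative-shift trick from the paper: x is [qlen, klen, bsz, n_head], with the
  # second axis indexed by the relative position embeddings (built from
  # pos_seq = klen - 1, ..., 0). Padding a zero column, reshaping to
  # [klen + 1, qlen, ...], dropping the first row and reshaping back shifts row i
  # left by (qlen - 1 - i), so entry (i, j) ends up scoring query i against its
  # true relative offset (mlen + i - j) from key j.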
  x_size = tf.shape(x)
  x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
  x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  x = tf.reshape(x, x_size)
  return x
def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
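  # w: current-segment content stream, [qlen, bsz, d_model]; r: relative position
  # embeddings, [rlen, 1, d_model]; mems: cached hidden states from previous
  # segments, [mlen, bsz, d_model]; r_w_bias / r_r_bias: the global content and
  # position biases (u and v in the paper).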
  scale = 1 / (d_head ** 0.5)
  with tf.variable_scope(scope):
    qlen = tf.shape(w)[0]
    rlen = tf.shape(r)[0]
    bsz = tf.shape(w)[1]
    cat = tf.concat([mems, w],
                    0) if mems is not None and mems.shape.ndims > 1 else w
    w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
                              kernel_initializer=kernel_initializer, name='qkv')
    r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
                               kernel_initializer=kernel_initializer, name='r')
    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
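    # Queries are taken only from the current segment (the last qlen positions of
    # cat); keys and values also span the cached memory, giving the longer context.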
    w_head_q = w_head_q[-qlen:]
    klen = tf.shape(w_head_k)[0]
    w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
    w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
    w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])
    r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])
    rw_head_q = w_head_q + r_w_bias
    rr_head_q = w_head_q + r_r_bias
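    # AC gathers the content-based terms (a) + (c) of the paper's attention
    # decomposition; BD gathers the position-based terms (b) + (d) and is then
    # realigned by rel_shift so that its second axis matches the key positions.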
    AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)
    BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)
    BD = rel_shift(BD)
    attn_score = (AC + BD) * scale
    attn_mask_t = attn_mask[:, :, None, None]
    attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
    attn_prob = tf.nn.softmax(attn_score, 1)
    attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)
    attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
    size_t = tf.shape(attn_vec)
    attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])
    attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
                               kernel_initializer=kernel_initializer, name='o')
    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
  return output
def embedding_lookup(lookup_table, x, use_tpu=True):
  return tf.nn.embedding_lookup(lookup_table, x)
def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                   proj_initializer, div_val=1, perms=None,
                                   proj_same_dim=True):
  # adaptive embedding (mask-based version); body omitted here, see the full code
  return y, ret_params
def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                  proj_initializer, div_val=1, perms=None,
                                  proj_same_dim=True):
  # adaptive embedding (one_hot TPU version); body omitted here, see the full code
  return y, ret_params
def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                             params, tie_projs, initializer=None, proj_initializer=None,
                             div_val=1, perms=None, head_target=None, proj_same_dim=True):
  # adaptive softmax loss (mask-based version); body omitted here, see the full code
  return nll
def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                            params, tie_projs, initializer=None, proj_initializer=None,
                            div_val=1, perms=None, head_target=None, proj_same_dim=True):
  # adaptive softmax loss (bin-based TPU version); body omitted here, see the full code
  return nll
def _create_mask(qlen, mlen, same_length=False):
  # causal mask over [qlen, mlen + qlen]; body omitted here, sketched after the listing
  return ret
def _cache_mem(curr_out, prev_mem, mem_len=None):
  # segment-level memory update; body omitted here, sketched after the listing
  return tf.stop_gradient(new_mem)
def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
                n_head, d_head, d_inner, dropout, dropatt,
                initializer, is_training, proj_initializer=None,
                mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
                same_length=False, clamp_len=-1, use_tpu=True,
                input_perms=None, target_perms=None, head_target=None,
                untie_r=False, proj_same_dim=True,
                scope='transformer'):
  """
  cutoffs: a list of python int. Cutoffs for adaptive softmax.
  tie_projs: a list of python bools. Whether to tie the projections.
  use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
        of adaptive softmax.
  perms: a list of tensors. Each tensor should be of size [len, bsz, bin_size].
        Only used in the adaptive setting.
  """
  new_mems = []
  with tf.variable_scope(scope):
    if untie_r:
      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
                               initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                 initializer=initializer)
    else:
      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
                                 initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
                                 initializer=initializer)
    qlen = tf.shape(dec_inp)[0]
    mlen = tf.shape(mems[0])[0] if mems is not None else 0
    klen = mlen + qlen
    if proj_initializer is None:
      proj_initializer = initializer
    lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
                 mask_adaptive_embedding_lookup)
    embeddings, shared_params = lookup_fn(
        x=dec_inp,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=input_perms,
        proj_same_dim=proj_same_dim)
    attn_mask = _create_mask(qlen, mlen, same_length)
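    # Relative positions run from klen - 1 down to 0 (largest offset first);
    # a positive clamp_len caps the maximum relative distance.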
    pos_seq = tf.range(klen - 1, -1, -1.0)
    if clamp_len > 0:
      pos_seq = tf.minimum(pos_seq, clamp_len)
    inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
    pos_emb = positional_embedding(pos_seq, inv_freq)
    output = tf.layers.dropout(embeddings, dropout, training=is_training)
    pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)
    if mems is None:
      mems = [None] * n_layer
    for i in range(n_layer):
      # cache new mems
      new_mems.append(_cache_mem(output, mems[i], mem_len))
      with tf.variable_scope('layer_{}'.format(i)):
        output = rel_multihead_attn(
            w=output,
            r=pos_emb,
            r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
            r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
            attn_mask=attn_mask,
            mems=mems[i],
            d_model=d_model,
            n_head=n_head,
            d_head=d_head,
            dropout=dropout,
            dropatt=dropatt,
            is_training=is_training,
            kernel_initializer=initializer)
        output = positionwise_FF(
            inp=output,
            d_model=d_model,
            d_inner=d_inner,
            dropout=dropout,
            kernel_initializer=initializer,
            is_training=is_training)
    output = tf.layers.dropout(output, dropout, training=is_training)
    logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
                     mask_adaptive_logsoftmax)
    loss = logsoftmax_fn(
        hidden=output,
        target=target,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        params=shared_params,
        tie_projs=tie_projs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=target_perms,
        head_target=head_target,
        proj_same_dim=proj_same_dim)
    return loss, new_mems
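The listing above omits the bodies of the adaptive embedding/softmax helpers and of the two small utilities _create_mask and _cache_mem. The adaptive softmax code is long and is left to the full code, but the two utilities are short enough to sketch here. The sketch below follows the reference TensorFlow implementation released with the paper and is meant only to illustrate what the elided functions do, not to replace the full code linked above:

import tensorflow as tf

def _create_mask(qlen, mlen, same_length=False):
  # 1 marks positions that must not be attended to (rel_multihead_attn pushes
  # their scores to -1e30); memory positions and the causal lower triangle stay 0.
  attn_mask = tf.ones([qlen, qlen])
  mask_u = tf.matrix_band_part(attn_mask, 0, -1)   # upper triangle incl. diagonal
  mask_dia = tf.matrix_band_part(attn_mask, 0, 0)  # diagonal only
  attn_mask_pad = tf.zeros([qlen, mlen])           # memory is always visible
  ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
  if same_length:
    # give every query the same effective attention length
    mask_l = tf.matrix_band_part(attn_mask, -1, 0)
    ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
  return ret

def _cache_mem(curr_out, prev_mem, mem_len=None):
  # Concatenate the previous memory with the current segment's hidden states and
  # keep the last mem_len positions; stop_gradient keeps backprop inside the segment.
  if mem_len is None or prev_mem is None:
    new_mem = curr_out
  elif mem_len == 0:
    return prev_mem
  else:
    new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]
  return tf.stop_gradient(new_mem)

In transformer() above, _cache_mem is applied to each layer's input before the layer runs, so new_mems collects the hidden states that the next segment will attend to as its memory: this is the segment-level recurrence with stopped gradients described in the paper.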

