[Transformer-XL] Paper implementation: Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context

Summary: an implementation of the paper Transformer-XL: Attentive Language Models Beyond a Fixed-Length Context.

I plan to reimplement the model in TensorFlow 2.0 when time permits; the original TensorFlow 1.x model code is as follows:

# Full model code (TensorFlow 1.x, from the original implementation)
import tensorflow as tf
def positional_embedding(pos_seq, inv_freq, bsz=None):
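  # Sinusoidal embedding of the (relative) position sequence: outer product
  # with the inverse frequencies, then concat(sin, cos) along the last axis.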
  sinusoid_inp = tf.einsum('i,j->ij', pos_seq, inv_freq)
  pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
  if bsz is not None:
    return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
  else:
    return pos_emb[:, None, :]
def positionwise_FF(inp, d_model, d_inner, dropout, kernel_initializer,
                    scope='ff', is_training=True):
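  # Position-wise feed-forward sublayer: dense -> ReLU -> dropout -> dense ->
  # dropout, followed by a residual connection and layer normalization.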
  output = inp
  with tf.variable_scope(scope):
    output = tf.layers.dense(inp, d_inner, activation=tf.nn.relu,
                             kernel_initializer=kernel_initializer,
                             name='layer_1')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_1')
    output = tf.layers.dense(output, d_model,
                             kernel_initializer=kernel_initializer,
                             name='layer_2')
    output = tf.layers.dropout(output, dropout, training=is_training,
                               name='drop_2')
    output = tf.contrib.layers.layer_norm(output + inp, begin_norm_axis=-1)
  return output
def rel_shift(x):
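  # Relative-shift trick from the paper: pad one slot along the key axis,
  # reshape so the padding rotates the entries, then slice it off, so the
  # position-based scores line up with the correct relative distances.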
  x_size = tf.shape(x)
  x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
  x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
  x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
  x = tf.reshape(x, x_size)
  return x
def rel_multihead_attn(w, r, r_w_bias, r_r_bias, attn_mask, mems, d_model,
                       n_head, d_head, dropout, dropatt, is_training,
                       kernel_initializer, scope='rel_attn'):
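  # Multi-head attention with relative positional encodings. w: current
  # segment [qlen, bsz, d_model]; r: relative position embeddings; mems:
  # cached hidden states from the previous segment(s); r_w_bias / r_r_bias:
  # the global content bias u and position bias v from the paper.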
  scale = 1 / (d_head ** 0.5)
  with tf.variable_scope(scope):
    qlen = tf.shape(w)[0]
    rlen = tf.shape(r)[0]
    bsz = tf.shape(w)[1]
    cat = tf.concat([mems, w],
                    0) if mems is not None and mems.shape.ndims > 1 else w
    w_heads = tf.layers.dense(cat, 3 * n_head * d_head, use_bias=False,
                              kernel_initializer=kernel_initializer, name='qkv')
    r_head_k = tf.layers.dense(r, n_head * d_head, use_bias=False,
                               kernel_initializer=kernel_initializer, name='r')
    w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, -1)
    w_head_q = w_head_q[-qlen:]
    klen = tf.shape(w_head_k)[0]
    w_head_q = tf.reshape(w_head_q, [qlen, bsz, n_head, d_head])
    w_head_k = tf.reshape(w_head_k, [klen, bsz, n_head, d_head])
    w_head_v = tf.reshape(w_head_v, [klen, bsz, n_head, d_head])
    r_head_k = tf.reshape(r_head_k, [rlen, n_head, d_head])
    rw_head_q = w_head_q + r_w_bias
    rr_head_q = w_head_q + r_r_bias
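    # AC: content-based term, (query + u) dot content keys; BD: position-based
    # term, (query + v) dot relative-position keys, realigned by rel_shift.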
    AC = tf.einsum('ibnd,jbnd->ijbn', rw_head_q, w_head_k)
    BD = tf.einsum('ibnd,jnd->ijbn', rr_head_q, r_head_k)
    BD = rel_shift(BD)
    attn_score = (AC + BD) * scale
    attn_mask_t = attn_mask[:, :, None, None]
    attn_score = attn_score * (1 - attn_mask_t) - 1e30 * attn_mask_t
    attn_prob = tf.nn.softmax(attn_score, 1)
    attn_prob = tf.layers.dropout(attn_prob, dropatt, training=is_training)
    attn_vec = tf.einsum('ijbn,jbnd->ibnd', attn_prob, w_head_v)
    size_t = tf.shape(attn_vec)
    attn_vec = tf.reshape(attn_vec, [size_t[0], size_t[1], n_head * d_head])
    attn_out = tf.layers.dense(attn_vec, d_model, use_bias=False,
                               kernel_initializer=kernel_initializer, name='o')
    attn_out = tf.layers.dropout(attn_out, dropout, training=is_training)
    output = tf.contrib.layers.layer_norm(attn_out + w, begin_norm_axis=-1)
  return output
def embedding_lookup(lookup_table, x, use_tpu=True):
  # Simplified here to a plain gather; the original implementation also has a
  # one-hot matmul branch for TPU when use_tpu is True.
  return tf.nn.embedding_lookup(lookup_table, x)
def mask_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                   proj_initializer, div_val=1, perms=None,
                                   proj_same_dim=True, scope='adaptive_embed'):
  # Adaptive embedding lookup (mask-based variant, used when use_tpu=False).
  # The body is elided in this post; see the original Transformer-XL code.
  return y, ret_params
def mul_adaptive_embedding_lookup(x, n_token, d_embed, d_proj, cutoffs, initializer,
                                  proj_initializer, div_val=1, perms=None,
                                  proj_same_dim=True, scope='adaptive_embed'):
  # Adaptive embedding lookup (one-hot/bin-based variant, used when use_tpu=True).
  # The body is elided in this post; see the original Transformer-XL code.
  return y, ret_params
def mask_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                             params, tie_projs, initializer=None,
                             proj_initializer=None, div_val=1, perms=None,
                             head_target=None, proj_same_dim=True,
                             scope='adaptive_softmax'):
  # Adaptive softmax loss (mask-based variant); returns the per-token NLL.
  # The body is elided in this post; see the original Transformer-XL code.
  return nll
def mul_adaptive_logsoftmax(hidden, target, n_token, d_embed, d_proj, cutoffs,
                            params, tie_projs, initializer=None,
                            proj_initializer=None, div_val=1, perms=None,
                            head_target=None, proj_same_dim=True,
                            scope='adaptive_softmax'):
  # Adaptive softmax loss (one-hot/bin-based variant for TPU); returns the NLL.
  # The body is elided in this post; see the original Transformer-XL code.
  return nll
def _create_mask(qlen, mlen, same_length=False):
  # Causal attention mask: 1 marks positions that must NOT be attended to;
  # the mlen cached memory positions on the left are always visible.
  # Body restored from the original Transformer-XL implementation.
  attn_mask = tf.ones([qlen, qlen])
  mask_u = tf.matrix_band_part(attn_mask, 0, -1)
  mask_dia = tf.matrix_band_part(attn_mask, 0, 0)
  attn_mask_pad = tf.zeros([qlen, mlen])
  ret = tf.concat([attn_mask_pad, mask_u - mask_dia], 1)
  if same_length:
    mask_l = tf.matrix_band_part(attn_mask, -1, 0)
    ret = tf.concat([ret[:, :qlen] + mask_l - mask_dia, ret[:, qlen:]], 1)
  return ret
def _cache_mem(curr_out, prev_mem, mem_len=None):
  # Update the segment-level memory: concatenate the previous memory with the
  # current layer output and keep the last mem_len steps; stop_gradient so the
  # memory is not backpropagated through.
  # Body restored from the original Transformer-XL implementation.
  if mem_len is None or prev_mem is None:
    new_mem = curr_out
  elif mem_len == 0:
    return prev_mem
  else:
    new_mem = tf.concat([prev_mem, curr_out], 0)[-mem_len:]
  return tf.stop_gradient(new_mem)
def transformer(dec_inp, target, mems, n_token, n_layer, d_model, d_embed,
                n_head, d_head, d_inner, dropout, dropatt,
                initializer, is_training, proj_initializer=None,
                mem_len=None, cutoffs=[], div_val=1, tie_projs=[],
                same_length=False, clamp_len=-1, use_tpu=True,
                input_perms=None, target_perms=None, head_target=None,
                untie_r=False, proj_same_dim=True,
                scope='transformer'):
  """
  cutoffs: a list of python int. Cutoffs for adaptive softmax.
  tie_projs: a list of python bools. Whether to tie the projections.
  use_tpu: if True, use one_hot in embedding lookup and bin-based implementation
        of adaptive softmax.
  perms: a list of tensors. Each tensor should be of size [len, bsz, bin_size].
        Only used in the adaptive setting.
  """
  new_mems = []
  with tf.variable_scope(scope):
    if untie_r:
      r_w_bias = tf.get_variable('r_w_bias', [n_layer, n_head, d_head],
                               initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_layer, n_head, d_head],
                                 initializer=initializer)
    else:
      r_w_bias = tf.get_variable('r_w_bias', [n_head, d_head],
                                 initializer=initializer)
      r_r_bias = tf.get_variable('r_r_bias', [n_head, d_head],
                                 initializer=initializer)
    qlen = tf.shape(dec_inp)[0]
    mlen = tf.shape(mems[0])[0] if mems is not None else 0
    klen = mlen + qlen
    if proj_initializer is None:
      proj_initializer = initializer
    lookup_fn = (mul_adaptive_embedding_lookup if use_tpu else
                 mask_adaptive_embedding_lookup)
    embeddings, shared_params = lookup_fn(
        x=dec_inp,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=input_perms,
        proj_same_dim=proj_same_dim)
    attn_mask = _create_mask(qlen, mlen, same_length)
    pos_seq = tf.range(klen - 1, -1, -1.0)
    if clamp_len > 0:
      pos_seq = tf.minimum(pos_seq, clamp_len)
    inv_freq = 1 / (10000 ** (tf.range(0, d_model, 2.0) / d_model))
    pos_emb = positional_embedding(pos_seq, inv_freq)
    output = tf.layers.dropout(embeddings, dropout, training=is_training)
    pos_emb = tf.layers.dropout(pos_emb, dropout, training=is_training)
    if mems is None:
      mems = [None] * n_layer
    for i in range(n_layer):
      # cache new mems
      new_mems.append(_cache_mem(output, mems[i], mem_len))
      with tf.variable_scope('layer_{}'.format(i)):
        output = rel_multihead_attn(
            w=output,
            r=pos_emb,
            r_w_bias=r_w_bias if not untie_r else r_w_bias[i],
            r_r_bias=r_r_bias if not untie_r else r_r_bias[i],
            attn_mask=attn_mask,
            mems=mems[i],
            d_model=d_model,
            n_head=n_head,
            d_head=d_head,
            dropout=dropout,
            dropatt=dropatt,
            is_training=is_training,
            kernel_initializer=initializer)
        output = positionwise_FF(
            inp=output,
            d_model=d_model,
            d_inner=d_inner,
            dropout=dropout,
            kernel_initializer=initializer,
            is_training=is_training)
    output = tf.layers.dropout(output, dropout, training=is_training)
    logsoftmax_fn = (mul_adaptive_logsoftmax if use_tpu else
                     mask_adaptive_logsoftmax)
    loss = logsoftmax_fn(
        hidden=output,
        target=target,
        n_token=n_token,
        d_embed=d_embed,
        d_proj=d_model,
        cutoffs=cutoffs,
        params=shared_params,
        tie_projs=tie_projs,
        initializer=initializer,
        proj_initializer=proj_initializer,
        div_val=div_val,
        perms=target_perms,
        head_target=head_target,
        proj_same_dim=proj_same_dim)
    return loss, new_mems
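
For reference, here is a minimal graph-construction sketch for the transformer function above. It is only an illustrative usage example under assumptions of my own (TensorFlow 1.x session-style code, toy hyperparameters, no adaptive-softmax cutoffs, and the elided adaptive embedding/softmax helpers restored from the original repository); the placeholder names and values below are not from the original post.

# Hypothetical usage sketch (assumptions noted above), not part of the original code.
tgt_len, mem_len, bsz, n_layer, d_model = 16, 16, 4, 2, 64

# Token inputs and targets are time-major: [tgt_len, bsz].
dec_inp = tf.placeholder(tf.int32, [tgt_len, bsz])
target = tf.placeholder(tf.int32, [tgt_len, bsz])
# One cached-memory tensor per layer, each of shape [mem_len, bsz, d_model].
mems = [tf.placeholder(tf.float32, [mem_len, bsz, d_model])
        for _ in range(n_layer)]

loss, new_mems = transformer(
    dec_inp=dec_inp, target=target, mems=mems,
    n_token=1000, n_layer=n_layer, d_model=d_model, d_embed=d_model,
    n_head=2, d_head=32, d_inner=128, dropout=0.1, dropatt=0.0,
    initializer=tf.random_normal_initializer(stddev=0.02),
    is_training=True, mem_len=mem_len, cutoffs=[], tie_projs=[False],
    use_tpu=False)
# At each training step, feed this step's new_mems back in as mems for the
# next segment so that context carries across segment boundaries.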

