Paper: Effective Approaches to Attention-based Neural Machine Translation
Authors: Minh-Thang Luong, Hieu Pham, Christopher D. Manning
Year: 2015
1. Complete Code
Here we implement the model in TensorFlow (with KerasNLP); the full code is as follows:
# Complete code
import tensorflow as tf
import keras_nlp
import matplotlib.pyplot as plt
import numpy as np
import os
import random

# Use SimHei so matplotlib can render Chinese characters
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# Data processing: each TSV line is split on tabs; column 1 is Chinese, column 3 is English
def process_data(x):
    res = tf.strings.split(x, '\t')
    return res[1], res[3]

# Load the data
dataset = tf.data.TextLineDataset('./data/transformer_data.tsv')
dataset = dataset.map(process_data)

# Build Chinese and English WordPiece vocabularies
vocab_chinese = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: x),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)
vocab_english = keras_nlp.tokenizers.compute_word_piece_vocabulary(
    dataset.map(lambda x, y: y),
    vocabulary_size=20_000,
    lowercase=True,
    strip_accents=True,
    split_on_cjk=True,
    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
)

# Build the tokenizers
chinese_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_chinese, oov_token="[UNK]")
english_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(vocabulary=vocab_english, oov_token="[UNK]")

# Second processing pass: tokenize, truncate, and add [START] (id 1) / [END] (id 2) tokens.
# Note the hard-coded batch size of 64: every batch is assumed to be full.
def process_data_(ch, en, maxtoken=128):
    ch = chinese_tokenizer(ch)[:, :maxtoken]
    en = english_tokenizer(tf.strings.lower(en))[:, :maxtoken]
    ch = tf.concat([tf.ones(shape=(64, 1), dtype='int32'), ch,
                    tf.ones(shape=(64, 1), dtype='int32') * 2], axis=-1).to_tensor()
    en = tf.concat([tf.ones(shape=(64, 1), dtype='int32'), en,
                    tf.ones(shape=(64, 1), dtype='int32') * 2], axis=-1)
    en_inputs = en[:, :-1].to_tensor()  # Drop the [END] tokens
    en_labels = en[:, 1:].to_tensor()   # Drop the [START] tokens
    return (ch, en_inputs), en_labels

dataset = dataset.batch(64).map(process_data_)
train_dataset = dataset.take(1000)
val_dataset = dataset.skip(500).take(300)  # note: this split overlaps the training batches

# Data is ready; inspect one batch (`pt` holds the Chinese batch)
for (pt, en), en_labels in dataset.take(1):
    break
print(pt.shape)
print(en.shape)
print(en_labels.shape)

# Build the encoder
class Encoder(tf.keras.layers.Layer):
    def __init__(self, vocabulary_size, d_model, units):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        self.rnn = tf.keras.layers.Bidirectional(
            layer=tf.keras.layers.LSTM(units=units, return_sequences=True, return_state=False),
            merge_mode='sum'
        )

    def call(self, inputs):
        x = inputs
        x = self.embedding(x)
        x = self.rnn(x)
        return x

# Build the cross-attention layer
class CrossAttention(tf.keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(key_dim=units, num_heads=1, **kwargs)
        self.add = tf.keras.layers.Add()
        self.norm = tf.keras.layers.LayerNormalization()

    def call(self, inputs):
        x, context = inputs
        attention_out, attention_score = self.mha(
            query=x, value=context, key=context, return_attention_scores=True)
        self.last_attention_score = attention_score
        x = self.add([x, attention_out])
        x = self.norm(x)
        return x

# Build the decoder
class Decoder(tf.keras.layers.Layer):
    def __init__(self, vocabulary_size, d_model, units, **kwargs):
        super().__init__()
        self.embedding = tf.keras.layers.Embedding(vocabulary_size, d_model)
        self.rnn = tf.keras.layers.LSTM(units, return_sequences=True)
        self.attention = CrossAttention(units, **kwargs)
        self.dense = tf.keras.layers.Dense(vocabulary_size, activation='softmax')

    def call(self, inputs):
        x, context = inputs
        x = self.embedding(x)
        x = self.rnn(x)
        x = self.attention((x, context))
        x = self.dense(x)
        return x

# Assemble the final model
class Seq2Seq(tf.keras.models.Model):
    def __init__(self, vocabulary_size_1, vocabulary_size_2, d_model, units, **kwargs):
        super().__init__()
        self.encoder = Encoder(vocabulary_size=vocabulary_size_1, d_model=d_model, units=units)
        self.decoder = Decoder(vocabulary_size=vocabulary_size_2, d_model=d_model, units=units)

    def call(self, inputs):
        pt, en = inputs
        context = self.encoder(pt)
        output = self.decoder((en, context))
        return output

seq2seq = Seq2Seq(chinese_tokenizer.vocabulary_size(), english_tokenizer.vocabulary_size(), 512, 30)

# Model overview
seq2seq((pt, en))
seq2seq.summary()

# Model configuration: loss and accuracy are masked so [PAD] (id 0) positions are ignored
def masked_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')
    loss = loss_fn(y_true, y_pred)
    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

def masked_acc(y_true, y_pred):
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)
    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)
    return tf.reduce_sum(match) / tf.reduce_sum(mask)

seq2seq.compile(
    optimizer='adam',
    loss=masked_loss,
    metrics=[masked_acc, masked_loss]
)

# Model training
seq2seq.fit(train_dataset, epochs=20, validation_data=val_dataset)

# Inference: greedy decoding, one token at a time
class Inference(tf.Module):
    def __init__(self, model, tokenizer_1, tokenizer_2):
        self.model = model
        self.tokenizer_1 = tokenizer_1
        self.tokenizer_2 = tokenizer_2

    def __call__(self, sentence, MAX_TOKEN=128):
        assert isinstance(sentence, tf.Tensor)
        if len(sentence.shape) == 0:
            sentence = sentence[tf.newaxis]
        sentence = self.tokenizer_1(sentence)
        sentence = tf.concat([tf.ones(shape=[sentence.shape[0], 1], dtype='int32'), sentence,
                              tf.ones(shape=[sentence.shape[0], 1], dtype='int32') * 2], axis=-1).to_tensor()
        encoder_input = sentence

        start = tf.constant(1, dtype='int64')[tf.newaxis]
        end = tf.constant(2, dtype='int64')[tf.newaxis]

        # tf.TensorArray behaves like a Python list
        output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
        # Write the start token at index 0
        output_array = output_array.write(0, start)

        for i in tf.range(MAX_TOKEN):
            output = tf.transpose(output_array.stack())
            predictions = self.model.predict((encoder_input, output), verbose=0)  # Shape `(batch_size, seq_len, vocab_size)`
            # Select the last token along the seq_len dimension
            predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`
            predicted_id = tf.argmax(predictions, axis=-1)
            # Append `predicted_id` to output_array as the next decoder input
            output_array = output_array.write(i + 1, predicted_id[0])
            # Stop as soon as the end token is produced
            if predicted_id == end:
                break

        output = tf.squeeze(output_array.stack())
        output = self.tokenizer_2.detokenize(output)
        return output

inference = Inference(seq2seq, chinese_tokenizer, english_tokenizer)

# Run inference
sentence = '你好'
sentence = tf.constant(sentence)
inference(sentence)
# Output:
# <tf.Tensor: shape=(), dtype=string, numpy=b"[START] hello ! [END]">
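
In the code above, CrossAttention uses Keras's MultiHeadAttention with a single head as a stand-in for the paper's global attention. For comparison, the sketch below implements Luong et al.'s "general" score, score(h_t, h̄_s) = h_tᵀ W_a h̄_s, together with the attentional vector h̃_t = tanh(W_c[c_t; h_t]) from the paper. This is a minimal illustrative sketch, not part of the article's code: the class name LuongGlobalAttention and its internals are assumptions, and it omits the residual connection and layer normalization that CrossAttention adds.

import tensorflow as tf

class LuongGlobalAttention(tf.keras.layers.Layer):
    """Hypothetical drop-in for CrossAttention using Luong's 'general' score."""
    def __init__(self, units):
        super().__init__()
        self.W_a = tf.keras.layers.Dense(units, use_bias=False)     # score(h_t, h_s) = h_t^T W_a h_s
        self.W_c = tf.keras.layers.Dense(units, activation='tanh')  # h~_t = tanh(W_c [c_t; h_t])

    def call(self, inputs):
        # x: decoder states (B, T_t, units); context: encoder states (B, T_s, units)
        x, context = inputs
        scores = tf.matmul(x, self.W_a(context), transpose_b=True)  # (B, T_t, T_s)
        weights = tf.nn.softmax(scores, axis=-1)                    # alignment weights a_t(s) over source positions
        c = tf.matmul(weights, context)                             # context vectors c_t, (B, T_t, units)
        return self.W_c(tf.concat([c, x], axis=-1))                 # attentional states h~_t

The paper also evaluates a simple dot score h_tᵀ h̄_s and a concat score v_aᵀ tanh(W_a[h_t; h̄_s]), as well as a local attention variant that attends only to a window of source positions; only the global "general" form is sketched here.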
[seq2seq] Paper implementation: Effective Approaches to Attention-based Neural Machine Translation (Part 2): https://developer.aliyun.com/article/1504076?spm=a2c6h.13148508.setting.40.36834f0eMJOehx