Paper: Deep contextualized word representations

Authors: Matthew E. Peters, Mark Neumann, Mohit Iyyer, Matt Gardner, Christopher Clark, Kenton Lee, Luke Zettlemoyer

Year: 2018

This paper introduces a new type of deep contextualized word representation that models both the complex characteristics of word use and polysemy (how a word's meaning varies across contexts). The word vectors are learned functions of the internal states of a deep bidirectional language model (biLM) pre-trained on a large text corpus. The paper shows that these representations can easily be added to existing models and significantly improve the state of the art on six challenging NLP problems, including question answering, textual entailment, and sentiment analysis.
1. Complete Code

Most public implementations are in PyTorch; here ELMo is implemented in TensorFlow.
```python
# Complete code
import tensorflow as tf
import numpy as np

corpus = [
    'i do"t think you will win',
    'you are so bad',
    'you are unbelievable',
    'its taste is not good',
    'good',
    'so funny',
    'i"m so glad to hear that'
]

# Build the character vocabulary
vocabulary_char = list('abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\\|_@#$%ˆ&*˜‘+-=<>()[]{}') + ['<bow>', '<eow>', '<pow>']
vocabulary_char = dict(zip(vocabulary_char, range(len(vocabulary_char))))

# Build the word vocabulary
vocabulary_word = set()
for sentence in corpus:
    vocabulary_word = vocabulary_word.union(set(sentence.split()))
vocabulary_word = list(vocabulary_word) + ['<bos>', '<eos>', '<pos>']
vocabulary_word = dict(zip(vocabulary_word, range(len(vocabulary_word))))

# Data preprocessing
def data_process(corpus, max_words_num, max_character_num, out='int'):
    """
    max_words_num: maximum sentence length (in words)
    max_character_num: maximum number of characters per word
    """
    sentences = []
    words = []
    for item in corpus:
        item = item.split()[:max_words_num]
        sentences.append(['<bos>'] + item + ['<eos>'] + ['<pos>'] * (max_words_num - len(item)))
    for item in sentences:
        word_list = []
        for characters in item:
            if characters in ['<bos>', '<eos>', '<pos>']:
                word_list.append(['<pow>'] * (max_character_num + 2))
            else:
                characters = list(characters)[:max_character_num]
                word_list.append(['<bow>'] + characters + ['<eow>'] + ['<pow>'] * (max_character_num - len(characters)))
        words.append(word_list)
    if out == 'int':
        for i, items in enumerate(sentences):
            for j, item in enumerate(items):
                sentences[i][j] = vocabulary_word[item]
        for i, items in enumerate(words):
            for j, item in enumerate(items):
                for k, char in enumerate(item):
                    words[i][j][k] = vocabulary_char[char]
    return np.array(sentences), np.array(words)

sentence, words = data_process(corpus, 10, 6)
# sentence.shape, words.shape
# ((7, 12), (7, 12, 8))

# Targets: the left-shifted and right-shifted sentence, concatenated along the time axis
# (23 is the integer ID of <pos> in vocabulary_word)
y_true = np.c_[sentence[:, 1:], np.array([23] * 7).reshape(-1, 1),
               sentence[:, :-1], np.array([23] * 7).reshape(-1, 1)]


class Character_layer(tf.keras.layers.Layer):
    def __init__(self, input_dim, embedding_dim, output_dim, filters_list, kernel_size_list, highway_num):
        """
        input_dim: size of the character vocabulary
        embedding_dim: character embedding dimension
        output_dim: projection dimension
        filters_list: Conv1D channel counts
        kernel_size_list: Conv1D kernel sizes
        highway_num: number of highway layers
        """
        super(Character_layer, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim, embedding_dim)
        self.list_conv1 = []
        for filters, kernel_size in zip(filters_list, kernel_size_list):
            conv1 = tf.keras.layers.Conv1D(filters, kernel_size, padding='same')
            self.list_conv1.append(conv1)
        self.list_highway = []
        self.dim = sum(filters_list)
        for i in range(highway_num):
            dense = tf.keras.layers.Dense(self.dim * 2)
            self.list_highway.append(dense)
        self.projection = tf.keras.layers.Dense(output_dim)

    def call(self, inputs):
        # (batch, seq_len, token_len) -> (batch*seq_len, token_len)
        batch_size, seq_len, token_len = inputs.shape
        inputs = tf.reshape(inputs, shape=(batch_size * seq_len, token_len))
        inputs = self.embedding(inputs)  # (batch*seq_len, token_len, embedding_dim)
        outputs_list = []
        for conv in self.list_conv1:
            # Conv1D slides over the character positions (channels-last), then max-pool over positions
            output = conv(inputs)
            output = tf.reduce_max(output, axis=1)
            output = tf.keras.activations.relu(output)
            outputs_list.append(output)
        outputs = tf.concat(outputs_list, axis=-1)
        # Highway layers: gated mix of a non-linear transform and the untransformed input
        for highway in self.list_highway:
            h = highway(outputs)
            activation = tf.keras.activations.relu(h[:, :self.dim])
            gate = tf.keras.activations.sigmoid(h[:, self.dim:])
            outputs = activation * gate + outputs * (1 - gate)
        outputs = self.projection(outputs)
        outputs = tf.reshape(outputs, shape=(batch_size, seq_len, -1))
        return outputs


class Elmo(tf.keras.models.Model):
    def __init__(self, lstm_num):
        super().__init__()
        self.embedding = Character_layer(len(vocabulary_char), 200, 50, [12, 24, 36], [2, 3, 4], 2)
        # lstm_num stacked forward LSTMs and lstm_num stacked backward LSTMs
        self.forward_lstm = []
        self.backward_lstm = []
        for i in range(lstm_num):
            self.forward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=False))
            self.backward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=True))
        self.forward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')
        self.backward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')

    def get_result(self, inputs):
        inputs = self.embedding(inputs)
        # Run the forward and backward stacks separately
        outputs1 = inputs
        outputs2 = inputs
        for lstm in self.forward_lstm:
            outputs1 = lstm(outputs1)
        for lstm in self.backward_lstm:
            outputs2 = lstm(outputs2)
        outputs1 = self.forward_projection(outputs1)
        outputs2 = self.backward_projection(outputs2)
        # Concatenate the forward and backward predictions along the time axis
        return tf.concat([outputs1, outputs2], axis=1)


model = Elmo(5)
result = model.get_result(words)   # one forward pass to build the weights

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

def train_one_epoch(i):
    with tf.GradientTape() as tape:
        y_predict = model.get_result(words)
        loss_value = loss(y_true=y_true, y_pred=y_predict)
    grads = tape.gradient(loss_value, model.trainable_variables)
    print("Step: {}, Loss: {}".format(i, loss_value.numpy()))
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

for i in range(100):
    train_one_epoch(i)
```
2. Paper Walkthrough
Traditional word embeddings are fixed: a word's vector does not change with the surrounding sentence. In everyday language, however, polysemy is very common; the word "apple", for example, can refer either to the fruit or to the company. This paper addresses polysemy by letting a word's vector shift in embedding space depending on its context.

ELMo has a deep network structure: it uses multiple stacked LSTM layers, which improves performance significantly over a single LSTM layer, and because LSTMs are recurrent networks they capture the context in which a word is used. Extensive experiments show that ELMo representations work very well in practice and can easily be added to existing models for six different and challenging language-understanding problems, including textual entailment, question answering, and sentiment analysis.
2.1 Model Architecture
This part of the walkthrough draws on ELMo解读(论文 + PyTorch源码), with some additional material.

The structure of the ELMo pre-training model is as follows:
2.1.1 Character Encode Layer
The structure of the Character Encode Layer is shown below:

First, the Input Sentence is parsed; as an example, take the sentence "i like this dog very much" (its parsed form is sketched at the end of this data-preparation description).
The approach fixes the sentence length to W (`num_words`) and the word length to C (`max_chars_per_token`):

- For each sentence: add `<bos>` at the start and `<eos>` at the end, then pad to the fixed length with `<pos>`.
- For each word: add `<bow>` at the start and `<eow>` at the end, then pad to the fixed length with `<pow>`.
Then build separate vocabularies for words and characters, `vocabulary_word` and `vocabulary_char`, and convert both the sentences and the words to integer IDs.

That completes the data preparation.
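Since the original figure with the parsed example is not reproduced here, the following hand-worked sketch (my own illustration, consistent with the `data_process` function in Section 3.1) shows what "i like this dog very much" looks like after word-level and character-level padding with `num_words = 10` and `max_chars_per_token = 6`:

```python
# Illustration only: padded form of "i like this dog very much"
# with num_words = 10 and max_chars_per_token = 6.

# Word level: <bos> + 6 words + <eos> + 4 * <pos>  ->  12 tokens in total
padded_sentence = ['<bos>', 'i', 'like', 'this', 'dog', 'very', 'much',
                   '<eos>', '<pos>', '<pos>', '<pos>', '<pos>']

# Character level: every token becomes max_chars_per_token + 2 = 8 symbols,
# e.g. "dog" -> <bow> d o g <eow> plus 3 * <pow> of padding
padded_dog = ['<bow>', 'd', 'o', 'g', '<eow>', '<pow>', '<pow>', '<pow>']

# Special word tokens (<bos>/<eos>/<pos>) are represented as 8 copies of <pow>
padded_special = ['<pow>'] * 8
```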
The descriptions of the individual layers that follow are quoted directly from ELMo解读(论文 + PyTorch源码):
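Since the quoted figures are not reproduced here, the essence of the Character Encode Layer can be stated briefly: character embeddings are passed through several Conv1D filters, max-pooled over the character positions, concatenated, and then refined by highway layers before a final projection. Each highway layer computes a gated mix of a non-linear transform and the identity; this is the standard highway formulation, and it matches the relu/sigmoid split in the `Character_layer` code:

$$
g = \sigma(W_g x + b_g), \qquad y = g \odot \mathrm{ReLU}(W_h x + b_h) + (1 - g) \odot x
$$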
2.1.2 N-BiLSTM Layer
An LSTM is a recurrent network structure; here a bidirectional recurrent setup is used. The original write-up covers the details, so again I quote ELMo解读(论文 + PyTorch源码):

In this paper, several forward LSTMs and several backward LSTMs are trained separately; their final outputs are concatenated and then projected, and the whole stack is trained as a bidirectional language model. The model is shown below:

As you can see, the training targets are the concatenation of the word sequence's right shift and left shift; once training is finished, the learned parameters can be used to produce word representations.
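As a conceptual sketch (my own illustration, not from the original post): the forward language model is trained to predict the next word and the backward model the previous word, so the targets are the input sequence shifted left and shifted right. In the toy code, `y_true` simply concatenates the two shifted index sequences along the time axis.

```python
# Conceptual sketch of bidirectional language-model targets for one padded sentence.
tokens    = ['<bos>', 'you', 'are', 'so', 'bad', '<eos>']
next_word = ['you', 'are', 'so', 'bad', '<eos>', '<pos>']   # forward LM target (left shift)
prev_word = ['<pos>', '<bos>', 'you', 'are', 'so', 'bad']   # backward LM target (right shift)
```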
2.1.3 Word Representation
As the figure shows, the ELMo representation is built from the input sentence embedding together with the hidden states of the L BiLSTM layers. The embedding layer's output has dimension D, while each BiLSTM layer consists of two LSTMs, each linearly projected to dimension D, so every BiLSTM layer contributes 2D dimensions; in total that gives $D + L \times 2D$. For convenience, the D-dimensional embedding output is duplicated so that every layer has dimension 2D, and the stacked representation ends up with shape $(L+1, B, W, 2D)$. That concludes the model description.
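The post stops at pre-training, so the following is only a hedged sketch (not in the original code) of how one could stack the layer outputs into the $(L+1, B, W, 2D)$ tensor and then collapse them with the paper's task-specific weighted sum $\mathrm{ELMo}_k^{task} = \gamma^{task} \sum_{j=0}^{L} s_j^{task}\, \mathbf{h}_{k,j}^{LM}$. The helper and its arguments are hypothetical; it assumes the per-layer outputs of the toy Elmo model have already been collected (and, since the backward LSTMs use `go_backwards=True`, that their outputs have been reversed back into normal time order, e.g. with `tf.reverse(x, axis=[1])`).

```python
import tensorflow as tf

# Hypothetical helper (not in the original post): build the (L+1, B, W, 2D) stack
# and collapse it into one task-specific ELMo representation per token.
def elmo_representation(embedding_out, forward_outs, backward_outs, s_weights, gamma):
    """
    embedding_out : (B, W, D)  output of the Character Encode Layer
    forward_outs  : list of L tensors of shape (B, W, D), one per forward LSTM layer
    backward_outs : list of L tensors of shape (B, W, D), one per backward LSTM layer
    s_weights     : (L+1,) unnormalized layer weights (task specific)
    gamma         : scalar scale factor (task specific)
    """
    layers = [tf.concat([embedding_out, embedding_out], axis=-1)]  # duplicate layer 0 to reach 2D
    for f, b in zip(forward_outs, backward_outs):
        layers.append(tf.concat([f, b], axis=-1))                  # each BiLSTM layer is already 2D
    stacked = tf.stack(layers, axis=0)                             # (L+1, B, W, 2D)
    weights = tf.reshape(tf.nn.softmax(s_weights), (-1, 1, 1, 1))  # softmax-normalize the layer weights
    return gamma * tf.reduce_sum(weights * stacked, axis=0)        # (B, W, 2D)
```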
3. Step-by-Step Implementation

3.1 Imports and Data Preparation

The code is as follows:
```python
import tensorflow as tf
import numpy as np

corpus = [
    'i do"t think you will win',
    'you are so bad',
    'you are unbelievable',
    'its taste is not good',
    'good',
    'so funny',
    'i"m so glad to hear that'
]

# Build the character vocabulary
vocabulary_char = list('abcdefghijklmnopqrstuvwxyz0123456789,;.!?:’"/\\|_@#$%ˆ&*˜‘+-=<>()[]{}') + ['<bow>', '<eow>', '<pow>']
vocabulary_char = dict(zip(vocabulary_char, range(len(vocabulary_char))))

# Build the word vocabulary
vocabulary_word = set()
for sentence in corpus:
    vocabulary_word = vocabulary_word.union(set(sentence.split()))
vocabulary_word = list(vocabulary_word) + ['<bos>', '<eos>', '<pos>']
vocabulary_word = dict(zip(vocabulary_word, range(len(vocabulary_word))))

# Data preprocessing
def data_process(corpus, max_words_num, max_character_num, out='int'):
    """
    max_words_num: maximum sentence length (in words)
    max_character_num: maximum number of characters per word
    """
    sentences = []
    words = []
    for item in corpus:
        item = item.split()[:max_words_num]
        sentences.append(['<bos>'] + item + ['<eos>'] + ['<pos>'] * (max_words_num - len(item)))
    for item in sentences:
        word_list = []
        for characters in item:
            if characters in ['<bos>', '<eos>', '<pos>']:
                word_list.append(['<pow>'] * (max_character_num + 2))
            else:
                characters = list(characters)[:max_character_num]
                word_list.append(['<bow>'] + characters + ['<eow>'] + ['<pow>'] * (max_character_num - len(characters)))
        words.append(word_list)
    if out == 'int':
        for i, items in enumerate(sentences):
            for j, item in enumerate(items):
                sentences[i][j] = vocabulary_word[item]
        for i, items in enumerate(words):
            for j, item in enumerate(items):
                for k, char in enumerate(item):
                    words[i][j][k] = vocabulary_char[char]
    return np.array(sentences), np.array(words)

sentence, words = data_process(corpus, 10, 6)
# sentence.shape, words.shape
# ((7, 12), (7, 12, 8))

# Targets: the left-shifted and right-shifted sentence, concatenated along the time axis
# (23 is the integer ID of <pos> in vocabulary_word)
y_true = np.c_[sentence[:, 1:], np.array([23] * 7).reshape(-1, 1),
               sentence[:, :-1], np.array([23] * 7).reshape(-1, 1)]
```
3.2 Character Encode Layer
The code is as follows:
```python
class Character_layer(tf.keras.layers.Layer):
    def __init__(self, input_dim, embedding_dim, output_dim, filters_list, kernel_size_list, highway_num):
        """
        input_dim: size of the character vocabulary
        embedding_dim: character embedding dimension
        output_dim: projection dimension
        filters_list: Conv1D channel counts
        kernel_size_list: Conv1D kernel sizes
        highway_num: number of highway layers
        """
        super(Character_layer, self).__init__()
        self.embedding = tf.keras.layers.Embedding(input_dim, embedding_dim)
        self.list_conv1 = []
        for filters, kernel_size in zip(filters_list, kernel_size_list):
            conv1 = tf.keras.layers.Conv1D(filters, kernel_size, padding='same')
            self.list_conv1.append(conv1)
        self.list_highway = []
        self.dim = sum(filters_list)
        for i in range(highway_num):
            dense = tf.keras.layers.Dense(self.dim * 2)
            self.list_highway.append(dense)
        self.projection = tf.keras.layers.Dense(output_dim)

    def call(self, inputs):
        # (batch, seq_len, token_len) -> (batch*seq_len, token_len)
        batch_size, seq_len, token_len = inputs.shape
        inputs = tf.reshape(inputs, shape=(batch_size * seq_len, token_len))
        inputs = self.embedding(inputs)  # (batch*seq_len, token_len, embedding_dim)
        outputs_list = []
        for conv in self.list_conv1:
            # Conv1D slides over the character positions (channels-last), then max-pool over positions
            output = conv(inputs)
            output = tf.reduce_max(output, axis=1)
            output = tf.keras.activations.relu(output)
            outputs_list.append(output)
        outputs = tf.concat(outputs_list, axis=-1)
        # Highway layers: gated mix of a non-linear transform and the untransformed input
        for highway in self.list_highway:
            h = highway(outputs)
            activation = tf.keras.activations.relu(h[:, :self.dim])
            gate = tf.keras.activations.sigmoid(h[:, self.dim:])
            outputs = activation * gate + outputs * (1 - gate)
        outputs = self.projection(outputs)
        outputs = tf.reshape(outputs, shape=(batch_size, seq_len, -1))
        return outputs
```
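As a quick sanity check (not in the original post), the layer can be instantiated with the same hyperparameters the Elmo class in the next subsection passes in, and applied to the `words` array from Section 3.1:

```python
# Quick shape check, using the hyperparameters from the Elmo class below.
char_layer = Character_layer(len(vocabulary_char), 200, 50, [12, 24, 36], [2, 3, 4], 2)
char_embeddings = char_layer(words)   # words has shape (7, 12, 8)
print(char_embeddings.shape)          # expected: (7, 12, 50) - one 50-d vector per token
```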
3.3 N-BiLSTM Layer
The code is as follows:
```python
class Elmo(tf.keras.models.Model):
    def __init__(self, lstm_num):
        super().__init__()
        self.embedding = Character_layer(len(vocabulary_char), 200, 50, [12, 24, 36], [2, 3, 4], 2)
        # lstm_num stacked forward LSTMs and lstm_num stacked backward LSTMs
        self.forward_lstm = []
        self.backward_lstm = []
        for i in range(lstm_num):
            self.forward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=False))
            self.backward_lstm.append(tf.keras.layers.LSTM(50, return_sequences=True, go_backwards=True))
        self.forward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')
        self.backward_projection = tf.keras.layers.Dense(len(vocabulary_word), activation='softmax')

    def get_result(self, inputs):
        inputs = self.embedding(inputs)
        # Run the forward and backward stacks separately
        outputs1 = inputs
        outputs2 = inputs
        for lstm in self.forward_lstm:
            outputs1 = lstm(outputs1)
        for lstm in self.backward_lstm:
            outputs2 = lstm(outputs2)
        outputs1 = self.forward_projection(outputs1)
        outputs2 = self.backward_projection(outputs2)
        # Concatenate the forward and backward predictions along the time axis
        return tf.concat([outputs1, outputs2], axis=1)
```
3.4 Model Training

The code is as follows:
```python
model = Elmo(5)
result = model.get_result(words)   # one forward pass to build the weights

loss = tf.keras.losses.SparseCategoricalCrossentropy()
optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)

def train_one_epoch(i):
    with tf.GradientTape() as tape:
        y_predict = model.get_result(words)
        loss_value = loss(y_true=y_true, y_pred=y_predict)
    grads = tape.gradient(loss_value, model.trainable_variables)
    print("Step: {}, Loss: {}".format(i, loss_value.numpy()))
    optimizer.apply_gradients(zip(grads, model.trainable_variables))

for i in range(100):
    train_one_epoch(i)
```
4. Summary

References:
- https://zhuanlan.zhihu.com/p/466841781
- https://blog.csdn.net/Magical_Bubble/article/details/89160032