Project notes: this project is a walkthrough of the homework from Prof. Hung-yi Lee's course, offered with authorization on PaddlePaddle AI Studio.
Course: (link)
The AI Studio project: (link)
Dataset: (link)
This project is for reference only; it offers ideas and approaches, not standard answers. Copy with caution!
Sentence Classification: Supervised Learning (training on training_label.txt only)
From Sansui (三岁): quality guaranteed!
Project overview:
This is the companion homework for Prof. Hung-yi Lee's course. By popular request, this notebook walks through the assignment from start to finish; as usual, the full pipeline comes with fully commented code and explanations.
Study guide
Besides the course itself, you can also consult Chapter 6 of 《零基础实践深度学习》, "Sentiment Analysis: Text Sentiment Classification" (link).
Project description
- This assignment introduces a simple task in natural language processing: sentence classification (text classification).
- Given a sentence, decide its sentiment (positive sentences are labeled 1, negative ones 0).
Dataset
There are three files: training_label.txt, training_nolabel.txt, and testing_data.txt.
- training_label.txt: labeled training data (each sentence comes with a 0 or 1; +++$+++ is just the separator, ignore it)
  - e.g., 1 +++$+++ are wtf … awww thanks !
- training_nolabel.txt: unlabeled training data (sentences only), intended for semi-supervised learning
  - e.g., hates being this burnt !! ouch
- testing_data.txt: the test data whose sentences you must classify as 0 or 1
id,text
0,my dog ate our dinner . no , seriously … he ate it .
1,omg last day sooon n of primary noooooo x im gona be swimming out of school wif the amount of tears am gona cry
2,stupid boys … they ’ re so … stupid !
Project requirements
- Pretrain word embeddings with some method (e.g., skip-gram or CBOW); a sketch follows this list.
- Implement the text classifier with an RNN.
- No extra data is allowed (no other corpora and no pretrained models).
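The notebook below actually trains its embedding layer end to end together with the classifier instead of pretraining it. For reference, here is a minimal sketch of pretraining skip-gram embeddings on just the provided corpora (which stays within the "no extra data" rule); it assumes gensim >= 4 is installed, which is not part of the original notebook:

from gensim.models import Word2Vec

# collect tokenized sentences from the labeled and unlabeled training files
sentences = []
with open('语句分类/training_label.txt', encoding='utf-8') as f:
    for line in f:
        sentences.append(line.split('+++$+++')[-1].strip().split(' '))
with open('语句分类/training_nolabel.txt', encoding='utf-8') as f:
    for line in f:
        sentences.append(line.strip().split(' '))

# sg=1 selects skip-gram (sg=0 would be CBOW)
w2v = Word2Vec(sentences, vector_size=256, window=5, min_count=1, sg=1, workers=4)
w2v.save('w2v_all.model')  # hypothetical output path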
Data preparation
None.
Environment setup / installation
Data processing!
import os
import random
import numpy as np
import matplotlib.pyplot as plt
import paddle
import paddle.nn as nn

print(paddle.__version__)  # check the installed version
2.0.1
! unzip -oq /home/aistudio/data/data78806/语句分类.zip
data = open('语句分类/training_label.txt')
for i in data:
    print(i.split('+++$+++'))
    print(i.split(' ')[0])
    print(i.split('+++$+++')[-1].replace('\n', '').split(' '))
    break
['1 ', ' are wtf ... awww thanks !\n']
1
['', 'are', 'wtf', '...', 'awww', 'thanks', '!']
Dictionary generation
def open_label(data_path):  # open the file and read its contents
    '''
    This reader suits the 'label' and 'nolabel' files, whose lines are label + text;
    reading the test file with it would give incorrect results.
    (Note: nolabel lines contain no '+++$+++', so for them title[1:] also skips
    the first word of each sentence.)
    '''
    dict_set = set()  # a set (elements cannot repeat)
    with open(data_path, 'r', encoding='utf-8') as f:  # read the data into f
        for line in f.readlines():  # go through f line by line
            title = line.split('+++$+++')[-1].replace('\n', '').split(' ')  # see the previous cell for the effect
            for s in title[1:]:  # loop over the tokens
                dict_set.add(s)  # add each token to the set
    print(f'{data_path} read complete')
    return dict_set

def open_data(data_path):  # open the file and read its contents
    dict_set = set()  # a set (elements cannot repeat)
    with open(data_path, 'r', encoding='utf-8') as f:  # read the data into f
        for line in f.readlines():  # go through f line by line
            title = line.replace(',', ' ').replace('\n', '').split(' ')
            for s in title[1:]:  # loop over the tokens
                dict_set.add(s)  # add each token to the set
    print(f'{data_path} read complete')
    return dict_set

def create_dict(label_path, nolabel_path, data_path, dict_path):
    '''
    Build a dictionary from the data
    '''
    print('Reading data ...')
    label_data = open_label(label_path)  # read the three files
    nolabel_data = open_label(nolabel_path)
    data_data = open_data(data_path)
    dict_set = label_data | nolabel_data | data_data  # union of the three vocabularies
    print('Building the dictionary ...')
    dict_list = []  # a new empty list
    i = 0
    for s in dict_set:  # loop over the set
        dict_list.append([s, i])  # pair each token with its own integer id
        i += 1
    dict_txt = dict(dict_list)  # cast to a dict
    end_dict = {"<unk>": i}  # add the unknown token <unk>
    dict_txt.update(end_dict)  # store the <unk> entry
    # save the dictionary locally
    with open(dict_path, 'w', encoding='utf-8') as f:
        f.write(str(dict_txt))  # write the dict out as text
    print("Dictionary generated!")
    print(f'Dictionary size: {len(dict_txt.keys())}')
    return dict_txt

label_path = '语句分类/training_label.txt'  # assign the paths
data_path = '语句分类/testing_data.txt'
nolabel_path = '语句分类/training_nolabel.txt'
dict_path = './dict.txt'

word_dict = create_dict(label_path, nolabel_path, data_path, dict_path)
Reading data ...
语句分类/training_label.txt read complete
语句分类/training_nolabel.txt read complete
语句分类/testing_data.txt read complete
Building the dictionary ...
Dictionary generated!
Dictionary size: 245076
print(word_dict['deux'])   # spot check an entry
print(word_dict['<unk>'])  # spot check the <unk> id
212878
245075
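Since the dictionary was written to dict.txt as the str() of a Python dict, it can be restored in a later session with ast.literal_eval. A minimal sketch, assuming dict.txt was produced by the cell above:

import ast

with open('./dict.txt', 'r', encoding='utf-8') as f:
    reloaded_dict = ast.literal_eval(f.read())  # parse the str()-serialized dict back into a real dict
print(len(reloaded_dict))  # should match the size printed above (245076)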
Dataset processing
# build the data lists from the dictionary
with open('语句分类/training_label.txt', 'r', encoding='utf-8') as f_data:  # open the file
    data_list = []  # a new list
    for line in f_data.readlines():  # read line by line
        title = line.split('+++$+++')[-1].replace('\n', '').split(' ')  # token list of the sentence
        l = line.split()[0]  # the label
        labs = ""
        for s in title[1:]:  # loop over the tokens
            lab = str(word_dict[s])  # look up each token's id
            labs = labs + lab + ','  # append the id plus a comma
        labs = labs[:-1]  # drop the trailing comma
        labs = labs + '\t' + l + '\n'  # ids + tab + label + newline
        data_list.append(labs)  # collect the sample

random.shuffle(data_list)  # shuffle the data
val_len = int(len(data_list) * 0.8)  # split 8:2
val_data = data_list[val_len:]  # the last 20%
train_data = data_list[:val_len]  # the first 80%

with open('./val.txt', 'w', encoding='utf-8') as f_val:
    f_val.write(str(val_data))  # write the data out
with open('./train.txt', 'w', encoding='utf-8') as f_train:
    f_train.write(str(train_data))

print("Data lists generated!")
print(f'{len(train_data)} training samples')
print(f'{len(val_data)} validation samples')
Data lists generated!
160000 training samples
40000 validation samples
vocab_size = len(word_dict) + 1  # dictionary size plus one, kept for later use
print(vocab_size)
seq_len = 35  # sequence length (sentences are padded to this length)
batch_size = 32  # batch size
epochs = 2  # number of training epochs
learning_rate = 0.001  # learning rate
pad_id = word_dict['<unk>']  # id used as padding

# turn a list of id strings back into a sentence
def ids_to_str(ids):
    words = []
    for k in ids:  # loop over the ids
        w = list(word_dict)[int(k)]  # look up the token at that index (insertion order matches the ids)
        words.append(w if isinstance(w, str) else w.decode('ASCII'))  # collect the token
    return " ".join(words)  # join the tokens back into a sentence
245077
for i in train_data:  # loop over the data
    print(i)  # print the raw sample
    sent = i[:-3].split(',')  # the comma-separated ids
    label = int(i[-2])  # the label
    print('sentence list id is:', sent)
    print('sentence label id is:', label)
    print('--------------------------')
    print('sentence list is: ', ids_to_str(sent))
    print('sentence label is: ', label)
    break
28344,215624,119320,179559,1357,198253,44052	1
sentence list id is: ['28344', '215624', '119320', '179559', '1357', '198253', '44052']
sentence label id is: 1
--------------------------
sentence list is:  haha ! some things never change .
sentence label is:  1
# pad the samples to a fixed length and have a look
def create_padded_dataset(dataset):
    padded_sents = []
    labels = []
    for batch_id, data in enumerate(dataset):  # loop over the samples
        data = data.replace('\n', '').replace('\t', ',').split(',')  # flatten a sample into a list of fields
        sent, label = data[:-1], data[-1]  # every field but the last is a token id; the last is the label
        padded_sent = np.concatenate([sent[:seq_len], [pad_id] * (seq_len - len(sent))]).astype('int64')  # pad (or truncate) to seq_len
        padded_sents.append(padded_sent)  # collect the padded sentence
        labels.append(label)  # collect the label
    return np.array(padded_sents), np.array(labels).astype('int64').reshape(len(labels), 1)  # return both as arrays

# build the padded train and val arrays
train_sents, train_labels = create_padded_dataset(train_data)
val_sents, val_labels = create_padded_dataset(val_data)

# check the shapes
print(train_sents.shape)
print(train_labels.shape)
print(val_sents.shape)
print(val_labels.shape)
(160000, 35)
(160000, 1)
(40000, 35)
(40000, 1)
for i in train_sents:  # inspect the padded train data
    print(i)
    break
[ 10374 140526 101681 126741 92085 215624 89809 186808 128118 88456 63414 125783 92085 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075 245075]
# subclass paddle.io.Dataset to wrap the data
class IMDBDataset(paddle.io.Dataset):
    '''
    Wraps the arrays in a paddle.io.Dataset
    '''
    def __init__(self, sents, labels):  # store the data
        self.sents = sents
        self.labels = labels

    def __getitem__(self, index):  # fetch one sample
        data = self.sents[index]
        label = self.labels[index]
        return data, label

    def __len__(self):  # dataset size
        return len(self.sents)

# instantiate the datasets
train_dataset = IMDBDataset(train_sents, train_labels)
val_dataset = IMDBDataset(val_sents, val_labels)

# wrap them into batch generators
train_loader = paddle.io.DataLoader(train_dataset, return_list=True,
                                    shuffle=True, batch_size=batch_size, drop_last=True)
val_loader = paddle.io.DataLoader(val_dataset, return_list=True,
                                  shuffle=True, batch_size=batch_size, drop_last=True)
for i in train_dataset:  # inspect the data before batching
    print(i)
    break
(array([ 10374, 140526, 101681, 126741, 92085, 215624, 89809, 186808, 128118, 88456, 63414, 125783, 92085, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075, 245075]), array([1]))
for i in train_loader:  # inspect the data after batching
    print(i)
    break
[Tensor(shape=[32, 35], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True, [[10374 , 211351, 76510 , ..., 245075, 245075, 245075], [92372 , 181484, 214902, ..., 245075, 245075, 245075], [191117, 190423, 187643, ..., 245075, 245075, 245075], ..., [98834 , 189058, 55023 , ..., 245075, 245075, 245075], [122137, 74773 , 142631, ..., 245075, 245075, 245075], [1702 , 6849 , 55023 , ..., 245075, 245075, 245075]]), Tensor(shape=[32, 1], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True, [[0], [1], [0], [1], [1], [1], [0], [1], [0], [0], [1], [1], [1], [1], [0], [1], [0], [0], [0], [0], [0], [1], [0], [1], [1], [0], [0], [0], [1], [0], [1], [0]])]
Building the network
The temporal order of samples matters greatly in applications such as natural language processing, speech recognition, and handwriting recognition. To accommodate this, another network architecture emerged: the recurrent neural network (RNN).
In this example we use an RNN to exploit that sequential structure: each word is mapped to its embedding, the embedded sequence is run through the RNN, and the final hidden states serve as the sentence representation, which a Linear layer then maps to the two classes. To reduce overfitting we also apply Dropout.
RNNs are very effective on sequential data: they can mine the temporal as well as the semantic information in a sequence. This ability is what let deep learning models make breakthroughs on NLP problems such as speech recognition, language modeling, machine translation, and time-series analysis.
In an ordinary fully connected network or CNN, signals propagate only from one layer to the next and samples are processed independently at each moment, which is why such networks are also called feed-forward neural networks. In an RNN, a neuron's output can act on itself at the next time step: the input of a layer-i neuron at time m includes not only the output of layer i-1 at that time, but also the neuron's own output at time m-1.
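To make the recurrence concrete, here is a minimal numpy sketch of a single vanilla RNN layer, essentially the update that each layer of paddle's SimpleRNN performs (the sizes and random parameters are arbitrary, purely for illustration):

import numpy as np

input_size, hidden_size, steps = 8, 4, 5
rng = np.random.default_rng(0)

W_ih = rng.normal(size=(hidden_size, input_size))   # input-to-hidden weights
W_hh = rng.normal(size=(hidden_size, hidden_size))  # hidden-to-hidden weights
b = rng.normal(size=hidden_size)                    # bias

xs = rng.normal(size=(steps, input_size))  # one input sequence
h = np.zeros(hidden_size)                  # initial hidden state

for t in range(steps):
    # h_t = tanh(W_ih x_t + W_hh h_{t-1} + b): the state at time t depends on
    # the current input AND on the state carried over from time t-1
    h = np.tanh(W_ih @ xs[t] + W_hh @ h + b)

print(h)  # the final hidden state summarizes the whole sequence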
class MyRNN(paddle.nn.Layer):
    def __init__(self):
        super(MyRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, 256)  # embedding layer: a trainable 2-D embedding matrix
        self.rnn = nn.SimpleRNN(256, 256, num_layers=2, direction='forward', dropout=0.5)  # a simple two-layer RNN
        self.linear = nn.Linear(in_features=256 * 2, out_features=2)  # classifier
        self.dropout = nn.Dropout(0.5)  # regularization

    def forward(self, inputs):
        emb = self.dropout(self.embedding(inputs))
        # output has shape [batch_size, seq_len, num_directions * hidden_size]
        # hidden has shape [num_layers * num_directions, batch_size, hidden_size]
        output, hidden = self.rnn(emb)
        # the RNN is unidirectional, so hidden[-2] and hidden[-1] are the final
        # hidden states of the two stacked layers; concatenate them
        hidden = paddle.concat((hidden[-2, :, :], hidden[-1, :, :]), axis=1)
        # hidden now has shape [batch_size, hidden_size * 2]
        hidden = self.dropout(hidden)
        return self.linear(hidden)
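As a quick sanity check (a sketch, not part of the original run), you can push a random batch of token ids through the network and confirm the output shape is [batch_size, 2]:

check_model = MyRNN()
fake_batch = paddle.randint(low=0, high=vocab_size, shape=[batch_size, seq_len])  # random token ids
print(check_model(fake_batch).shape)  # expect [32, 2]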
# plotting helper
def draw_process(title, color, iters, data, label):
    plt.title(title, fontsize=24)  # title
    plt.xlabel("iter", fontsize=20)  # x axis
    plt.ylabel(label, fontsize=20)  # y axis
    plt.plot(iters, data, color=color, label=label)  # draw the curve
    plt.legend()
    plt.grid()
    plt.show()
# train the model
def train(model):
    model.train()
    opt = paddle.optimizer.Adam(learning_rate=learning_rate, parameters=model.parameters())  # optimizer with the learning rate

    # initialize the bookkeeping
    steps = 0
    Iters, total_loss, total_acc = [], [], []

    for epoch in range(epochs):  # loop over epochs
        for batch_id, data in enumerate(train_loader):  # loop over batches
            steps += 1
            sent = data[0]  # the sentences
            label = data[1]  # the labels

            logits = model(sent)  # forward pass
            loss = paddle.nn.functional.cross_entropy(logits, label)  # compute the loss
            acc = paddle.metric.accuracy(logits, label)  # compute the accuracy

            if batch_id % 500 == 0:  # log every 500 batches
                Iters.append(steps)  # record the step count
                total_loss.append(loss.numpy()[0])  # record the loss
                total_acc.append(acc.numpy()[0])  # record the accuracy
                print("epoch: {}, batch_id: {}, loss is: {}".format(epoch, batch_id, loss.numpy()))

            # update the parameters
            loss.backward()
            opt.step()
            opt.clear_grad()

        # evaluate once per epoch
        model.eval()
        accuracies = []
        losses = []
        for batch_id, data in enumerate(val_loader):  # loop over the validation batches
            sent = data[0]  # the sentences
            label = data[1]  # the labels

            logits = model(sent)  # forward pass
            loss = paddle.nn.functional.cross_entropy(logits, label)  # compute the loss
            acc = paddle.metric.accuracy(logits, label)  # compute the accuracy

            accuracies.append(acc.numpy())  # record the accuracy
            losses.append(loss.numpy())  # record the loss

        avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)  # average over the validation set
        print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))

        model.train()
        paddle.save(model.state_dict(), str(epoch) + "_model_final.pdparams")  # save a checkpoint per epoch

    draw_process("training loss", "red", Iters, total_loss, "training loss")  # plot the loss curve
    draw_process("training acc", "green", Iters, total_acc, "training acc")  # plot the accuracy curve

model = MyRNN()  # instantiate the model
train(model)  # start training
epoch: 0, batch_id: 0, loss is: [0.69794196]
epoch: 0, batch_id: 500, loss is: [0.74246097]
epoch: 0, batch_id: 1000, loss is: [0.7177408]
epoch: 0, batch_id: 1500, loss is: [0.67342]
epoch: 0, batch_id: 2000, loss is: [0.70416176]
epoch: 0, batch_id: 2500, loss is: [0.6862659]
epoch: 0, batch_id: 3000, loss is: [0.7122611]
epoch: 0, batch_id: 3500, loss is: [0.7753513]
epoch: 0, batch_id: 4000, loss is: [0.7068041]
epoch: 0, batch_id: 4500, loss is: [0.7294261]
[validation] accuracy: 0.4728749990463257, loss: 0.7290785908699036
epoch: 1, batch_id: 0, loss is: [0.76394814]
epoch: 1, batch_id: 500, loss is: [0.7539971]
epoch: 1, batch_id: 1000, loss is: [0.73696876]
epoch: 1, batch_id: 1500, loss is: [0.69534796]
epoch: 1, batch_id: 2000, loss is: [0.7292723]
epoch: 1, batch_id: 2500, loss is: [0.6313175]
epoch: 1, batch_id: 3000, loss is: [0.7186656]
epoch: 1, batch_id: 3500, loss is: [0.68441]
epoch: 1, batch_id: 4000, loss is: [0.6846897]
epoch: 1, batch_id: 4500, loss is: [0.7723298]
[validation] accuracy: 0.5213500261306763, loss: 0.6922441124916077
Model evaluation
'''
Model evaluation
'''
model_state_dict = paddle.load('1_model_final.pdparams')  # load the trained weights
model = MyRNN()  # instantiate the network
model.set_state_dict(model_state_dict)
model.eval()

accuracies = []
losses = []
for batch_id, data in enumerate(val_loader):  # loop over the validation data
    sent = data[0]  # the sentences
    label = data[1]  # the labels

    logits = model(sent)  # predict
    loss = paddle.nn.functional.cross_entropy(logits, label)  # compute the loss
    acc = paddle.metric.accuracy(logits, label)  # compute the accuracy

    accuracies.append(acc.numpy())  # record the accuracy
    losses.append(loss.numpy())  # record the loss

avg_acc, avg_loss = np.mean(accuracies), np.mean(losses)  # average over the validation set
print("[validation] accuracy: {}, loss: {}".format(avg_acc, avg_loss))
[validation] accuracy: 0.5213500261306763, loss: 0.6922440528869629
for i in open('语句分类/testing_data.txt'):  # peek at the first line
    print(i)
    break
id,text
Test-set processing and inference
with open('语句分类/testing_data.txt', 'r', encoding='utf-8') as f_data:  # open the file
    data_list = []  # a new list
    for line in f_data.readlines():  # read line by line
        title = line.replace(',', ' ').replace('\n', '').split(' ')  # replace commas with spaces and split; title[0] is the row id
        if title[0] != 'id':  # skip the header row
            labs = []
            for s in title[1:]:  # loop over the tokens
                lab = str(word_dict[s])  # look up each token's id
                labs.append(lab)  # collect the ids
            data_list.append(labs)  # collect the sample

# unlabeled variant of the padding helper
def create_padded_dataset(dataset):
    padded_sents = []
    for batch_id, data in enumerate(dataset):  # loop over the samples
        sent = data  # the numericalized sentence
        padded_sent = np.concatenate([sent[:seq_len], [pad_id] * (seq_len - len(sent))]).astype('int64')  # pad (or truncate) to seq_len
        padded_sents.append(padded_sent)  # collect the padded sentence
    return np.array(padded_sents)  # return as an array

test_loader = create_padded_dataset(data_list)
# turn a list of ids back into a sentence
def ids_to_str(ids):
    words = []
    for k in ids:  # loop over the ids
        w = list(word_dict)[k]  # look up the token at that index
        words.append(w if isinstance(w, str) else w.decode('ASCII'))  # collect the token
    return " ".join(words)  # join the tokens back into a sentence

label_map = {0: "negative", 1: "positive"}

model_state_dict = paddle.load('1_model_final.pdparams')  # load the trained weights
model = MyRNN()  # instantiate the network
model.set_state_dict(model_state_dict)
model.eval()

for batch_id, data in enumerate(test_loader):  # loop over the test sentences
    results = model(paddle.to_tensor(data.reshape(1, seq_len)))  # run inference on one sentence
    predictions = []
    for probs in results:  # map the logits to class labels
        idx = np.argmax(probs)  # index of the largest logit
        labels = label_map[idx]  # map it to a class name
        predictions.append(labels)
    for i, pre in enumerate(predictions):
        print(' data: {} \n sentiment: {}'.format(ids_to_str(data.tolist()), pre))
        print(idx)
        break
    break
 data: my dog ate our dinner . no seriously ... he ate it . <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> 
 sentiment: negative
0
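The assignment ultimately asks for a 0/1 prediction for every test sentence, while the cell above only demos the first one. A minimal sketch of batched inference that writes a submission file (the predict.csv name and the id,label format are assumptions, not taken from the original notebook):

model.eval()
preds = []
with paddle.no_grad():  # no gradients needed at inference time
    for start in range(0, len(test_loader), batch_size):  # test_loader is the padded (N, seq_len) array
        batch = paddle.to_tensor(test_loader[start:start + batch_size])
        logits = model(batch)
        preds.extend(paddle.argmax(logits, axis=1).numpy().tolist())  # 0 or 1 per sentence

with open('predict.csv', 'w', encoding='utf-8') as f:
    f.write('id,label\n')
    for idx, p in enumerate(preds):
        f.write(f'{idx},{p}\n')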
The (self-proclaimed) worst coder in the PaddlePaddle community; let's keep working hard together!
Remember: whatever Sansui (三岁) makes is a gem (shameless-plug series).