1 Related Posts
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 1 Post-Competition Summary and Analysis
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 2 Data Analysis
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 3 TextCNN & Fasttext Solution
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 4 Machine Learning (LGB) Solution
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 5 Bert Solution
- [NLP] iFLYTEK English Academic Paper Classification Challenge Top 10 Open-Source Solutions – 6 Score-Boosting Tricks
2 Introduction
(1) The first thing I did with the data was run a Fasttext baseline: Fasttext trains extremely fast, so it can produce a baseline in very little time. The implementation below builds the baseline on pretrained word vectors; an alternative that learns the embedding from scratch instead can be built with torchtext's data.Field, which is not covered here.
Processing steps of the pretrained-word-vector approach (a condensed sketch of how these pieces chain together follows the list):
- Data preprocessing
  - Remove line breaks
  - Remove special characters
  - Remove single characters
  - Collapse multiple spaces
  - Generalize digits
  - Lowercase all letters
  - Lemmatization
  - Remove stop words
- Choose a tokenizer
  - from nltk import WordPunctTokenizer
- Choose a padding tool
  - tf.keras.preprocessing.sequence.pad_sequences(sequence_train, maxlen=args.max_len)
- Choose a word-encoding tool
  - tf.keras.preprocessing.text.Tokenizer(lower=True)
- Train word2vec, fasttext, and glove models on the corpus, then build the embedding matrix from the word encodings
- Choose a network architecture
  - TextCNN
  - Fasttext
  - DPCNN
  - TextRNN
- Train
  - Initialize network weights
  - Cross-validation
  - Adversarial training
    - FGM
    - PGD
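Before the full implementation in Section 3, here is a condensed sketch of how these tools chain together. `preprocessed_docs` (cleaned strings) and `w2v_model` (a trained gensim Word2Vec) are placeholders for objects built later:
# Sketch: tokenize -> encode -> pad -> embedding-matrix lookup
import numpy as np
import tensorflow as tf
from nltk import WordPunctTokenizer
corpus = [WordPunctTokenizer().tokenize(doc) for doc in preprocessed_docs]
tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)
tokenizer.fit_on_texts(corpus)                  # build the word -> id mapping
seqs = tokenizer.texts_to_sequences(corpus)     # encode tokens as ids
seqs = tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=128)  # pad/truncate to max_len
# One random row per id; rows of known words are overwritten with pretrained vectors
embedding_matrix = np.random.random((len(tokenizer.word_index) + 1, 128))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]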
(2) I focused on tuning the TextCNN and FastText models. With the preprocessing pipeline and cross-validation they reached accuracies of 0.78+ and 0.77+ respectively, and after adding high-quality pseudo-labeled data they reached 0.8076 and 0.8070. This shows that data quality determines a large share of the final accuracy; the choice of architecture and hyperparameters only adds small gains on top. Due to limited time, TextRNN and DPCNN were not tuned, their online scores were not competitive, and they were not used for this task.
(3) For the embedding matrix I compared word2vec, fasttext, and glove vectors, as well as a concatenation of all three at 128 dimensions each. A single 128-dimensional word2vec model worked best, trained with the following parameters:
iter = 20
min_count = 3
sample = 1e-5
window = 5
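Under gensim 4 naming (size → vector_size, iter → epochs), these settings correspond roughly to the call below; a sketch, not the exact training script from Section 3.8:
from gensim.models import Word2Vec
# 128-dim skip-gram word2vec with the settings listed above (sg=1 as in Section 3.8)
w2v = Word2Vec(word_tokenized_corpus,  # list of token lists from preprocessing
               vector_size=128, epochs=20, min_count=3, sample=1e-5, window=5, sg=1)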
(4) I also tried adversarial training, with FGM and PGD as the two candidate methods. PGD performed slightly better, but it needs a large number of epochs (20 or more), and with adversarial training enabled, training becomes much slower and convergence is slow as well.
3 Solution
3.1 Packages
Install packages
!pip install nltk
!pip install gensim
!pip install python-Levenshtein
!pip install tensorflow-gpu
!pip install glove-python-binary
Import packages
from nltk import WordPunctTokenizer
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import tensorflow as tf  # tf.keras is used below for token encoding and padding
from tqdm import tqdm
from torch.utils import data
import nltk
import re
from nltk.stem import WordNetLemmatizer
import pickle
import torch
import os
import numpy as np
import torch.nn as nn
import pandas as pd
import torch.nn.functional as F
from sklearn import metrics
from torch.utils.data import DataLoader
cache_dir = 'cache'
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
3.2 Parameter Settings
class Config(object):
    def __init__(self):
        self.train_all = "data/train.csv"
        # Preprocessed files, cached to speed up subsequent runs
        self.train_path = "predata/train_x.csv"
        self.valid_path = "predata/valid_x.csv"
        self.testall_path = "data/test.csv"
        self.test_path = "predata/test_x.csv"
        # Label categories
        self.label_path = "predata/label_id2cate.pkl"
        # Preprocessed arrays, cached to speed up subsequent runs
        self.process_trainset_path = "predata/train_set.npy"
        self.process_trainlabel_path = "predata/train_label.npy"
        self.process_testset_path = "predata/test_set.npy"
        # Word-vector models
        self.fastText_path = "model/fasttext.bin"
        self.word2vec_path = "model/word2vec.bin"
        self.glove_path = "model/glove.bin"  # referenced by the glove branch below
        # Word-vector dimension
        self.embedding_size = 128
        # Maximum vocabulary size
        self.max_vocab_size = 50000
        # Maximum sequence length
        self.max_len = 128
        # Number of classes
        self.num_class = 39
        # Model save path
        self.save_path = "saved/"
        self.batch_size = 1000
        self.lr = 0.001
        self.num_epochs = 8  # 8 for TextCNN; FastText used 20
        # "FastText" or "TextCNN"
        self.model = "TextCNN"
args = Config()
3.3 TextCNN Architecture
class TextCNN(nn.Module):
def __init__(self, args, pretrained_path):
super(TextCNN, self).__init__()
self.dim_embed = args.embedding_size
        # A dropout of 0.6 gave the highest accuracy in our tests on this task
        self.dropout = 0.6
self.num_filters = 256
self.kernel_size = (4, 5, 3)
self.max_len = args.max_len
        self.n_vocab = pretrained_path.shape[0]  # vocabulary size, used when training the embedding from scratch
        self.num_classes = args.num_class  # number of classes
self.pretrained = True
self.pretrained_path = pretrained_path
if self.pretrained:
self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
else:
self.embedding = nn.Embedding(self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)
self.conv1 = nn.Conv2d(1, self.num_filters, (self.kernel_size[0], self.dim_embed))
self.conv2 = nn.Conv2d(1, self.num_filters, (self.kernel_size[1], self.dim_embed))
self.conv3 = nn.Conv2d(1, self.num_filters, (self.kernel_size[2], self.dim_embed))
self.max_pool1 = nn.MaxPool2d((self.max_len - self.kernel_size[0] + 1, 1))
self.max_pool2 = nn.MaxPool2d((self.max_len - self.kernel_size[1] + 1, 1))
        self.max_pool3 = nn.MaxPool2d((self.max_len - self.kernel_size[2] + 1, 1))
self.dropout = nn.Dropout(self.dropout)
self.fc = nn.Linear(self.num_filters * 3, self.num_classes)
def forward(self, x):
batch_size = x.shape[0]
x = self.embedding(x) # [batch_size, max_len, dim_embed]
x = x.unsqueeze(1) # [batch_size, 1, max_len, dim_embed]
x1 = F.relu(self.conv1(x)) # [batch_size, num_filters, max_len-kernel_size[0], 1]
x2 = F.relu(self.conv2(x)) # [batch_size, num_filters, max_len-kernel_size[1], 1]
x3 = F.relu(self.conv3(x)) # [batch_size, num_filters, max_len-kernel_size[2], 1]
x1 = self.max_pool1(x1) # [batch_size, num_filters, 1, 1]
x2 = self.max_pool2(x2) # [batch_size, num_filters, 1, 1]
x3 = self.max_pool3(x3) # [batch_size, num_filters, 1, 1]
x = torch.cat((x1, x2, x3), -1) # [batch_size, num_filters, 1, 3]
x = x.view(batch_size, 1, -1) # [batch_size, 1, num_filters*3]
x = self.dropout(x)
        x = self.fc(x)  # [batch_size, 1, num_classes]
        x = x.view(-1, self.num_classes)  # [batch_size, num_classes]
return x
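A quick shape check of the forward pass (a minimal sketch; the random embedding matrix and vocabulary size are illustrative, not the trained vectors):
dummy_embed = torch.randn(50000, args.embedding_size)   # [vocab_size, dim_embed]
net = TextCNN(args, dummy_embed)
dummy_ids = torch.randint(0, 50000, (4, args.max_len))  # batch of 4 padded id sequences
print(net(dummy_ids).shape)                             # torch.Size([4, 39])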
3.4 FastText Architecture
class FastText(nn.Module):
def __init__(self, args, pretrained_path):
super(FastText, self).__init__()
self.dim_embed = args.embedding_size
self.hidden_size = 256
self.n_vocab = pretrained_path.shape[0]
self.num_classes = args.num_class
self.pretrained = True
self.pretrained_path = pretrained_path
        # A dropout of 0.6 gave the highest accuracy on this task
self.dropout = 0.6
if self.pretrained:
self.embedding = nn.Embedding.from_pretrained(
self.pretrained_path, freeze=False)
else:
self.embedding = nn.Embedding(self.n_vocab, self.dim_embed)
self.dropout = nn.Dropout(self.dropout)
self.fc1 = nn.Linear(self.dim_embed, self.hidden_size)
self.fc2 = nn.Linear(self.hidden_size, self.num_classes)
def forward(self, x):
batch_size = x.shape[0]
x = self.embedding(x)
x = x.mean(dim=1)
x = self.dropout(x)
x = self.fc1(x)
x = F.relu(x)
x = self.fc2(x)
return x
3.5 TextRNN Architecture
class TextRNN(nn.Module):
def __init__(self, args, pretrained_path):
super(TextRNN, self).__init__()
self.pretrained = True
self.pretrained_path = pretrained_path
self.n_vocab = pretrained_path.shape[0]
self.dim_embed = args.embedding_size
self.hidden_size = 64
self.num_layers = 2
self.dropout = 0.4
self.num_classes = args.num_class
if self.pretrained:
self.embedding = nn.Embedding.from_pretrained(
self.pretrained_path, freeze=False)
else:
self.embedding = nn.Embedding(
self.n_vocab, self.dim_embed, padding_idx=self.n_vocab - 1)
self.lstm = nn.LSTM(self.dim_embed, self.hidden_size, self.num_layers,
bidirectional=True, batch_first=True, dropout=self.dropout)
self.fc = nn.Linear(self.hidden_size * 2, self.num_classes)
def forward(self, x):
x = self.embedding(x)
x, _ = self.lstm(x)
x = self.fc(x[:, -1, :])
return x
3.6 DPCNN Architecture
class DPCNN(nn.Module):
def __init__(self, args, pretrained_path):
super(DPCNN, self).__init__()
self.dim_embed = args.embedding_size
self.num_filters = 256
self.kernel_size = 3
self.n_vocab = pretrained_path.shape[0]
self.num_classes = args.num_class
self.pretrained = True
self.pretrained_path = pretrained_path
if self.pretrained:
self.embedding = nn.Embedding.from_pretrained(self.pretrained_path, freeze=False)
else:
self.embedding = nn.Embedding(self.n_vocab, self.dim_embed)
self.conv_region = nn.Conv2d(1, self.num_filters, (self.kernel_size, self.dim_embed), stride=1)
self.conv = nn.Conv2d(self.num_filters, self.num_filters, (self.kernel_size, 1), stride=1)
self.max_pool = nn.MaxPool2d(kernel_size=(self.kernel_size, 1), stride=2)
self.padding1 = nn.ZeroPad2d((0, 0, 1, 1)) # top bottom
self.padding2 = nn.ZeroPad2d((0, 0, 0, 1)) # bottom
self.relu = nn.ReLU()
self.fc = nn.Linear(self.num_filters, self.num_classes)
def forward(self, x):
x = self.embedding(x) # [batch_size, max_len, dim_embed]
x = x.unsqueeze(1) # [batch_size, 1, max_len, dim_embed]
x = self.conv_region(x) # [batch_size, num_filters, max_len-kernel_size, 1]
x = self.padding1(x) # [batch_size, num_filters, max_len, 1]
x = self.relu(x)
x = self.conv(x) # [batch_size, num_filters, max_len-kernel_size, 1]
x = self.padding1(x) # [batch_size, num_filters, max_len, 1]
x = self.relu(x)
x = self.conv(x) # [batch_size, num_filters, max_len-kernel_size, 1]
while x.size()[2] > 2:
x = self._block(x) # [batch_size, num_filters, 1, 1]
x = x.squeeze() # [batch_size, num_filters]
x = self.fc(x) # [batch_size, num_classes]
return x
def _block(self, x):
x = self.padding2(x)
px = self.max_pool(x)
x = self.padding1(px)
x = self.relu(x)
x = self.conv(x)
x = self.padding1(x)
x = self.relu(x)
x = self.conv(x)
# Short Cut
x = x + px
return x
3.7 Data Preprocessing
from nltk.stem import WordNetLemmatizer
import re
import nltk
import numpy as np
# Run once if the corpora are missing: nltk.download('stopwords'); nltk.download('wordnet')
en_stop = set(nltk.corpus.stopwords.words('english'))
custom_stop_words = [
'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
'al.', 'elsevier', 'pmc', 'czi', 'www'
]
for word in custom_stop_words:
en_stop.add(word)
def preprocess_text(document):
    stemmer = WordNetLemmatizer()
    document = str(document)
    # Replace line breaks
    document = document.replace("\n", ' ')
    document = document.replace("/'", '')
    # Remove special characters
    document = re.sub(r'\W', ' ', document)
    # Remove all single characters
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)
    # Remove a single character at the start of the text
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)
    # Collapse multiple spaces into one
    document = re.sub(r'\s+', ' ', document, flags=re.I)
    # Generalize digits: every number above 9 is replaced with hashes,
    # e.g. 123 becomes ### and 2021 becomes ####
    document = re.sub('[0-9]{5,}', '#####', document)
    document = re.sub('[0-9]{4}', '####', document)
    document = re.sub('[0-9]{3}', '###', document)
    document = re.sub('[0-9]{2}', '##', document)
    # Lowercase
    document = document.lower()
    # Lemmatization
    tokens = document.split()
    tokens = [stemmer.lemmatize(word) for word in tokens]
    # Remove stop words
    tokens = [word for word in tokens if word not in en_stop]
    # Remove very short tokens
    tokens = [word for word in tokens if len(word) > 3]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
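An illustrative input/output pair for preprocess_text (the sentence is made up):
raw = "The 2 Results\nWe used 15000 samples (see Fig. 3) from https://example.org."
print(preprocess_text(raw))
# -> 'result ##### sample example': punctuation stripped, 15000 -> #####, stop words,
#    custom stop words (used, fig, https, org) and tokens of length <= 3 all dropped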
3.8 Training the Word Vectors
import nltk
from torch.utils import data
from tqdm import tqdm
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from gensim.models import FastText
import torch
from glove import Glove
from glove import Corpus
from nltk import WordPunctTokenizer
def build_word2vec(args, train):
    trainall_title = list(train['title'])
    trainall_abstract = list(train['abstract'])
    # Join title and abstract with a separator token (a plain list avoids the
    # fixed-width-string truncation that np.empty_like would cause here)
    trainall_combine = [t + ' <sep> ' + a for t, a in zip(trainall_title, trainall_abstract)]
    print('Building word vectors: cleaning text...')
    final_corpus = [preprocess_text(sentence) for sentence in trainall_combine if sentence.strip() != '']
    print('Building word vectors: tokenizing...')
    word_punctuation_tokenizer = WordPunctTokenizer()
    word_tokenized_corpus = [word_punctuation_tokenizer.tokenize(sent) for sent in final_corpus]
    # Word-encoding tool
    tokenizer = tf.keras.preprocessing.text.Tokenizer(lower=True)
    tokenizer.fit_on_texts(word_tokenized_corpus)
    # Word-vector training hyperparameters
    # embedding_size = 128
    window_size = 40
    min_word = 5
    down_sampling = 1e-2
    vector_tag = 'word2vec'  # 'fasttext', 'glove' or 'word2vec'; word2vec worked best
    print('Building vocabulary: training word vectors...')
    if vector_tag == 'fasttext':
        if os.path.exists(args.fastText_path):
            ft_model = FastText.load(args.fastText_path)
        else:
            print("Training fasttext vectors")
            ft_model = FastText(word_tokenized_corpus,
                                vector_size=args.embedding_size,
                                window=window_size,
                                min_count=min_word,
                                sample=down_sampling,
                                sg=1,
                                epochs=40)
            ft_model.save(args.fastText_path)
    elif vector_tag == 'glove':
        if os.path.exists(args.glove_path):
            gl_model = Glove.load(args.glove_path)
        else:
            print("Training glove vectors")
            corpus_model = Corpus()
            corpus_model.fit(word_tokenized_corpus, window=5)
            gl_model = Glove(no_components=args.embedding_size, learning_rate=0.05)
            gl_model.fit(corpus_model.matrix, epochs=20, no_threads=1, verbose=True)
            gl_model.add_dictionary(corpus_model.dictionary)
            gl_model.save(args.glove_path)
    elif vector_tag == 'word2vec':
        if os.path.exists(args.word2vec_path):
            wv_vectors = KeyedVectors.load_word2vec_format(args.word2vec_path, binary=True)
        else:
            print("Training word2vec vectors")
            wv_model = Word2Vec(word_tokenized_corpus, vector_size=args.embedding_size,
                                min_count=min_word, sg=1, epochs=20)
            wv_model.wv.save_word2vec_format(args.word2vec_path, binary=True)
            wv_vectors = wv_model.wv
    # Initialize the embedding matrices randomly; rows of OOV words stay random
    embedding_matrix_ft1 = np.random.random((len(tokenizer.word_index) + 1, args.embedding_size))
    embedding_matrix_ft2 = np.random.random((len(tokenizer.word_index) + 1, args.embedding_size))
    embedding_matrix_ft3 = np.random.random((len(tokenizer.word_index) + 1, args.embedding_size))
    pas = 0
    # Fill rows from the pretrained vectors
    for word, i in tokenizer.word_index.items():
        try:
            # embedding_matrix_ft1[i] = ft_model.wv[word]
            embedding_matrix_ft2[i] = wv_vectors[word]
            # embedding_matrix_ft3[i] = gl_model.word_vectors[gl_model.dictionary[word]]
        except KeyError:
            pas += 1
    # To concatenate all three vector types, uncomment:
    # embedding_matrix_ft = np.concatenate([embedding_matrix_ft1, embedding_matrix_ft2, embedding_matrix_ft3], axis=1)
    # print(embedding_matrix_ft.shape)
    # Fasttext matrix only:
    # return embedding_matrix_ft1, tokenizer
    # word2vec matrix only:
    return embedding_matrix_ft2, tokenizer
    # glove matrix only:
    # return embedding_matrix_ft3, tokenizer
    # concatenated matrix:
    # return embedding_matrix_ft, tokenizer
3.9 Wrapping the Datasets
class PaperData(data.Dataset):
    def __init__(self, args, tokenizer, split='train'):
        self.texts = []
        self.labels = []
        self.args = args
        self.split = split
        self.tokenizer = tokenizer
        text_tokenizer = WordPunctTokenizer()
        if self.split == "train":
            print("Preprocessing the training set...")
            if not os.path.exists(args.process_trainset_path):
                train = pd.read_csv(self.args.train_path)
                word_tokenized_corpus = []
                for text in tqdm(train['text']):
                    textp = preprocess_text(text)
                    tokentext = text_tokenizer.tokenize(textp)
                    word_tokenized_corpus.append(tokentext)
                print('Training set: encoding and padding...')
                sequence_train = tokenizer.texts_to_sequences(word_tokenized_corpus)
                # Pad/truncate to the maximum sequence length
                sequence_train = tf.keras.preprocessing.sequence.pad_sequences(sequence_train, maxlen=args.max_len)
                self.texts = sequence_train
                self.labels = list(train['label'])
                np.save(args.process_trainset_path, sequence_train)
                np.save(args.process_trainlabel_path, np.array(self.labels))
            else:
                train_set = np.load(args.process_trainset_path)
                train_label = np.load(args.process_trainlabel_path)
                self.texts = list(train_set)
                self.labels = list(train_label)
        elif self.split == "test":
            print("Preprocessing the test set...")
            if not os.path.exists(args.process_testset_path):
                # test_x.csv was written comma-separated in make_data_loader
                test = pd.read_csv(self.args.test_path)
                word_tokenized_corpus = []
                for text in tqdm(test['text']):
                    textp = preprocess_text(text)
                    tokentext = text_tokenizer.tokenize(textp)
                    word_tokenized_corpus.append(tokentext)
                print('Test set: encoding and padding...')
                sequence_test = tokenizer.texts_to_sequences(word_tokenized_corpus)
                sequence_test = tf.keras.preprocessing.sequence.pad_sequences(sequence_test, maxlen=args.max_len)
                self.texts = sequence_test
                self.labels = [-1 for i in range(len(test))]
                np.save(args.process_testset_path, sequence_test)
            else:
                test_set = list(np.load(args.process_testset_path))
                self.texts = test_set
                self.labels = [-1 for i in range(len(test_set))]
        else:
            raise Exception("No file for split %s" % self.split)
        assert len(self.texts) == len(self.labels)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        _text = self.texts[index]
        _label = self.labels[index]
        sample = {'text': _text, 'label': _label}
        return self.transform(sample)

    def transform(self, sample):
        text = np.array(sample['text'])
        label = np.array(sample['label'])
        # Convert to tensors and move to the target device
        text = torch.from_numpy(text).to(torch.int64).to(DEVICE)
        label = torch.from_numpy(label).to(torch.int64).to(DEVICE)
        return {'text': text, 'label': label}
# Data loading and preprocessing
def make_data_loader(args):
    train = pd.read_csv(args.train_all)
    if not os.path.exists(args.train_path):
        # First run: build the label mapping and merge title and abstract
        label_id2cate = dict(enumerate(train.categories.unique()))
        label_cate2id = {value: key for key, value in label_id2cate.items()}
        # Cache the label mapping to speed up subsequent runs
        with open(args.label_path, 'wb') as f:
            pickle.dump(label_id2cate, f, pickle.HIGHEST_PROTOCOL)
        train_x = pd.DataFrame(columns=['text', 'label'])
        # Concatenate title and abstract
        train['text'] = train['title'] + ' ' + train['abstract']
        train_x['label'] = train['categories'].map(label_cate2id)
        train_x['text'] = train['text']
        train_x.to_csv(args.train_path, index=False)
        # Preprocess the test set
        test = pd.read_csv(args.testall_path, sep='\t')
        test_x = pd.DataFrame(columns=['text'])
        # Concatenate title and abstract
        test['text'] = test['title'] + ' ' + test['abstract']
        test_x['text'] = test['text']
        # Cache the processed data to speed up subsequent runs
        test_x.to_csv(args.test_path, index=False)
    else:
        # Subsequent runs: load the cached, preprocessed data
        test_x = pd.read_csv(args.test_path)
        train_x = pd.read_csv(args.train_path)
        with open(args.label_path, 'rb') as f:
            label_id2cate = pickle.load(f)
    # Train the word vectors and build the pretrained embedding matrix
    embedding_matrix_ft, tokenizer = build_word2vec(args, train)
    # Wrap the training set
    train_set = PaperData(args, tokenizer=tokenizer, split='train')
    # Wrap the test set
    test_set = PaperData(args, tokenizer=tokenizer, split='test')
    # train_set stays a data.Dataset (not a DataLoader) because the K-fold
    # cross-validation below needs dataset-level index access
    test_loader = DataLoader(test_set, batch_size=args.batch_size, shuffle=False)
    return embedding_matrix_ft, train_set, test_loader, label_id2cate
3.10 Training the Model
3.10.1 Initializing Network Weights
# Initialize network weights (the embedding layer is excluded by name)
def init_network(model, method='kaiming', exclude='embedding', seed=123):
for name, w in model.named_parameters():
if exclude not in name:
if 'weight' in name:
if method == 'xavier':
nn.init.xavier_normal_(w)
elif method == 'kaiming':
nn.init.kaiming_normal_(w)
else:
nn.init.normal_(w)
elif 'bias' in name:
nn.init.constant_(w, 0)
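Note that init_network is defined here but never called in the main function of Section 3.11; invoke it explicitly after building the model if you want this initialization (a sketch):
model = TextCNN(args, pretrained_path)  # pretrained_path: FloatTensor embedding matrix
init_network(model, method='kaiming')   # embedding weights are skipped by name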
3.10.2 Adversarial Training
# Adversarial training
## Method 1: PGD
class PGD():
    def __init__(self, model):
        self.model = model
        self.emb_backup = {}
        self.grad_backup = {}

    def attack(self, epsilon=1., alpha=0.3, emb_name='emb', is_first_attack=False):
        # Set emb_name to the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                if is_first_attack:
                    # Back up the clean embedding weights on the first attack step
                    self.emb_backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = alpha * param.grad / norm
                    param.data.add_(r_at)
                    param.data = self.project(self.emb_backup, name, param.data, epsilon)

    def restore(self, emb_name='emb'):
        # Set emb_name to the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                if len(self.emb_backup) == 0:
                    continue
                assert name in self.emb_backup
                param.data = self.emb_backup[name]
        self.emb_backup = {}

    def project(self, t, param_name, param_data, epsilon):
        if param_name not in self.emb_backup:
            return param_data
        # Project the perturbation back onto the epsilon ball around the clean weights
        r = param_data - t[param_name]
        if torch.norm(r) > epsilon:
            r = epsilon * r / torch.norm(r)
        return t[param_name] + r

    def backup_grad(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad:
                self.grad_backup[name] = param.grad.clone()

    def restore_grad(self):
        for name, param in self.model.named_parameters():
            if len(self.grad_backup) == 0:
                continue
            if param.requires_grad:
                param.grad = self.grad_backup[name]
## Method 2: FGM
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1., emb_name='embedding'):
        # Set emb_name to the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='embedding'):
        # Set emb_name to the name of the embedding parameter in your model
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}
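The FGM usage pattern inside a training step matches the commented-out lines in the training loop below; a minimal sketch:
fgm = FGM(model)
for batch in train_iter:
    loss = F.cross_entropy(model(batch["text"]), batch["label"])
    loss.backward()                  # normal gradients
    fgm.attack()                     # perturb the embedding along its gradient
    loss_adv = F.cross_entropy(model(batch["text"]), batch["label"])
    loss_adv.backward()              # accumulate adversarial gradients
    fgm.restore()                    # restore the clean embedding weights
    optimizer.step()
    model.zero_grad()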
3.10.3 K-Fold Data Split
# K-fold data split
def load_data_kfold(dataset, batch_size, k, n):
    print("Building the train/validation split for fold {}".format(n + 1))
    l = len(dataset)
    shuffle_dataset = True
    random_seed = 42  # fixed seed so every fold sees the same shuffled order
    indices = list(range(l))
    if shuffle_dataset:
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    # Indices of the validation samples for fold n
    val_indices = indices[int(l / k) * n:int(l / k) * (n + 1)]
    train_indices = list(set(indices).difference(set(val_indices)))
    train_sampler = data.SubsetRandomSampler(train_indices)  # build samplers
    valid_sampler = data.SubsetRandomSampler(val_indices)
    train_loader = data.DataLoader(dataset, batch_size=batch_size,
                                   sampler=train_sampler)  # dataloader for the train split
    validation_loader = data.DataLoader(dataset, batch_size=batch_size,
                                        sampler=valid_sampler)  # dataloader for the validation split
    print("Split complete")
    return train_loader, validation_loader
3.10.4 Training Loop
# Training loop. Adversarial training is included (commented out); on this
# paper-classification task it did not bring clear gains, with PGD doing
# somewhat better than FGM.
def train(args, model, train_set, test_iter, label_id2cate):
    # 10-fold cross-validation
    k_fold = 10
    predict_all = np.zeros([10000, 39])  # accumulated test-set logits (10000 samples, 39 classes)
    for n in range(k_fold):
        # Cross-validation split for this fold
        train_iter, val_iter = load_data_kfold(train_set, args.batch_size, k_fold, n)
        model.train()
        optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)
        total_batch = 0
        dev_best_acc = 0
        for epoch in range(args.num_epochs):
            # FGM adversarial training
            # fgm = FGM(model)
            # PGD adversarial training
            # pgd = PGD(model)
            print('Fold=[{}/{}] Epoch [{}/{}]'.format(n + 1, k_fold, epoch + 1, args.num_epochs))
            for i, data in enumerate(train_iter):
                text = data["text"]
                label = data["label"]
                outputs = model(text)
                model.zero_grad()
                loss = F.cross_entropy(outputs, label)
                loss.backward()
                # FGM adversarial training
                # fgm.attack()  # add an adversarial perturbation on the embedding
                # loss_adv = F.cross_entropy(model(text), label)
                # loss_adv.backward()  # accumulate adversarial gradients on top of the normal ones
                # fgm.restore()  # restore the embedding weights
                # PGD adversarial training (K: number of inner attack steps)
                # pgd.backup_grad()
                # for t in range(K):
                #     pgd.attack(is_first_attack=(t == 0))  # perturb embedding; back up weights on the first step
                #     if t != K - 1:
                #         model.zero_grad()
                #     else:
                #         pgd.restore_grad()
                #     loss_adv = F.cross_entropy(model(text), label)
                #     loss_adv.backward()  # accumulate adversarial gradients on top of the normal ones
                # pgd.restore()  # restore the embedding weights
                optimizer.step()
                if total_batch % 10 == 0:
                    y_true = label.data.cpu()
                    y_pred = torch.max(outputs.data, 1)[1].cpu()
                    train_acc = metrics.accuracy_score(y_true, y_pred)
                    dev_acc, dev_loss = evaluate(model, val_iter)
                    if dev_acc > dev_best_acc:
                        dev_best_acc = dev_acc
                        torch.save(model.state_dict(), args.save_path + '/' + args.model + '.ckpt')
                        # print("saved model, best acc on dev: %.4f" % dev_acc)
                    msg = 'Iter:{0} train_loss: {1:.3} train_acc: {2:.2%} val_loss: {3:.2} val_acc: {4:.3%}'
                    print(msg.format(total_batch, loss.item(), train_acc, dev_loss, dev_acc))
                    model.train()
                total_batch += 1
        # Predict the test set with this fold's model and accumulate the logits
        result = []
        model.eval()
        with torch.no_grad():
            for i, data in enumerate(test_iter):
                text = data["text"]
                outputs = model(text)
                y_pred = outputs.data.cpu().numpy()
                result.extend(y_pred)
        predict_all += np.array(result)
    avg_predict = predict_all / k_fold
    np.save(args.model + ".npy", predict_all)  # un-averaged logits across folds
    predict_kfold(avg_predict, args.model, label_id2cate)
# Validation
def evaluate(model, val_iter):
model.eval()
loss_total = 0
y_preds = []
y_trues = []
with torch.no_grad():
for data in val_iter:
text = data["text"]
label = data["label"]
outputs = model(text)
loss = F.cross_entropy(outputs, label)
loss_total += loss
y_true = label.tolist()
y_pred = torch.max(outputs.data, 1)[1].cpu().tolist()
y_trues.extend(y_true)
y_preds.extend(y_pred)
acc = metrics.accuracy_score(y_trues, y_preds)
return acc, loss_total/len(val_iter)
# Generate the submission file
def predict_kfold(avg_predict, model_name, label_id2cate):
result = np.argmax(avg_predict, axis=1)
sub = pd.read_csv('./data/sample_submit.csv')
sub['categories'] = list(result)
sub['categories'] = sub['categories'].map(label_id2cate)
sub.to_csv('submit_{}.csv'.format(model_name), index=False)
3.11 Main Function
np.random.seed(1)
torch.manual_seed(1)
torch.cuda.manual_seed_all(1)
torch.backends.cudnn.deterministic = True  # make results reproducible across runs
# Load the data and build the pretrained embedding matrix
embedding_matrix_ft, train_set, test_iter, label_id2cate = make_data_loader(
args)
pretrained_path = torch.FloatTensor(embedding_matrix_ft)
if args.model == "TextCNN": # 0.8076(epoch=8)
model = TextCNN(args, pretrained_path)
elif args.model == "FastText": # 0.8070 (epoch=20)
model = FastText(args, pretrained_path)
elif args.model == "TextRNN": # 0.9807(20 epoch), 0.9884(30 epoch)
model = TextRNN(args, pretrained_path)
elif args.model == "DPCNN":
model = DPCNN(args, pretrained_path)
model.to(DEVICE)
# Start training
train(args, model, train_set, test_iter, label_id2cate)