1. Dataset used by all code examples in this post
The raw data is https://storage.googleapis.com/cluebenchmark/tasks/iflytek_public.zip (the link was obtained from the CLUEbenchmark/CLUE repository: Chinese Language Understanding Evaluation Benchmark: datasets, baselines, pre-trained models, corpus and leaderboard).
It is a long-text classification dataset. (The training set has 12,133 samples, the validation set 2,599, and the test set 282; the average text length in the training set is 289 and the longest text is 4,282.)
Unzipping yields three files: train.json, dev.json, and test.json. Since only the training and validation sets have labels, my strategy is to train a linear classifier on the training set for 10,000 epochs and compare representations by their metrics on the validation set.
The concrete data preprocessing depends on the text representation method, so it is covered in the corresponding subsections below.
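Each line of these JSON files is one sample; the only fields used in this post are sentence and label. As a minimal illustration of the format (the print statements are only there to show what the fields contain):

import json

# Read the first sample of the training set; every line is a standalone JSON object.
with open('/data/other_data/iflytek_public/train.json') as f:
    first_sample = json.loads(f.readline())

print(first_sample['sentence'])  # the raw text of the sample
print(first_sample['label'])     # the class label, converted with int() in the classifier code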
2. From word representations to sentence representations
2.1.1 Pre-trained Transformer + [CLS]
This post uses BERT as the example; similar pre-trained models such as DistilBERT, RoBERTa, and Longformer work in much the same way.
The pre-trained checkpoint was downloaded from https://huggingface.co/bert-base-chinese.
In BERT's output, the pooler_output key is the [CLS] token's final hidden state passed through the pooler (a linear layer followed by tanh); it is used here to represent the whole sentence.
The feature dimension is 768.
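To make that concrete, here is a minimal sketch showing that pooler_output is simply the [CLS] hidden state passed through the pooler layer (this assumes the standard BertModel implementation in transformers; the sentence is a made-up example):

import torch
from transformers import AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-chinese")
model = AutoModel.from_pretrained("bert-base-chinese")
model.eval()

inputs = tokenizer("一个测试句子", return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)

# pooler_output = tanh(Linear(hidden state of the [CLS] token))
cls_hidden = outputs.last_hidden_state[:, 0]            # shape: (batch, 768)
recomputed = torch.tanh(model.pooler.dense(cls_hidden))
print(torch.allclose(recomputed, outputs.pooler_output, atol=1e-6))  # expected: True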
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset,TensorDataset,DataLoader
from transformers import AutoModel,AutoTokenizer

gpu_device='cuda:0'
epoch_num=10000
embedding_batch_size=256
feature_dim=768
train_batch_size=2048
inference_batch_size=4096

# Text representation part
tokenizer=AutoTokenizer.from_pretrained("/data/pretrained_model/bert-base-chinese")

class TextInitializeDataset(Dataset):
    """Wrap the raw data in a Dataset; each sample is one text string"""
    def __init__(self,mode='train') -> None:
        data=[json.loads(x) for x in open('/data/other_data/iflytek_public/'+mode+'.json').readlines()]
        self.text=[x['sentence'] for x in data]

    def __getitem__(self, index):
        return self.text[index]

    def __len__(self):
        return len(self.text)

def collate_fn(batch):
    pt_batch=tokenizer(batch,padding=True,truncation=True,max_length=512,return_tensors='pt')
    return {'input_ids':pt_batch['input_ids'],'token_type_ids':pt_batch['token_type_ids'],'attention_mask':pt_batch['attention_mask']}

# Training set
train_dataset=TextInitializeDataset()
train_dataloader=DataLoader(train_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)
# Validation set
dev_dataset=TextInitializeDataset(mode='dev')
dev_dataloader=DataLoader(dev_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)

# Text representation
bert_encoder=AutoModel.from_pretrained("/data/pretrained_model/bert-base-chinese")
bert_encoder.to(gpu_device)
with torch.no_grad():
    bert_encoder.eval()
    # Training set
    train_embedding=torch.zeros(len(train_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(train_dataloader,desc='Encoding the training set'):
        matrix_count+=1
        outputs=bert_encoder(input_ids=batch['input_ids'].to(gpu_device),token_type_ids=batch['token_type_ids'].to(gpu_device),
                             attention_mask=batch['attention_mask'].to(gpu_device))['pooler_output']
        train_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
    # Validation set
    dev_embedding=torch.zeros(len(dev_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(dev_dataloader,desc='Encoding the validation set'):
        matrix_count+=1
        outputs=bert_encoder(input_ids=batch['input_ids'].to(gpu_device),token_type_ids=batch['token_type_ids'].to(gpu_device),
                             attention_mask=batch['attention_mask'].to(gpu_device))['pooler_output']
        dev_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
2.1.2 Pre-trained Transformer + mean pooling
Everything else is the same as in Section 2.1.1.
I wrote a separate mean-pooling variant because the [CLS] representation is widely considered a weak sentence representation, often worse than simply mean-pooling the token embeddings. (The experimental results later also bear this out.)
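The only difference from Section 2.1.1 is the pooling step. Isolated on a toy tensor (the shapes below are made up purely for illustration), masked mean pooling looks like this:

import torch

# Toy stand-ins: 2 sentences, 4 token positions, hidden size 3.
last_hidden_state = torch.randn(2, 4, 3)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# Zero out padded positions, then divide by the number of real tokens per sentence.
hidden = last_hidden_state.clone()
hidden[attention_mask == 0] = 0
mean_pooled = hidden.sum(dim=1) / attention_mask.sum(dim=-1).unsqueeze(-1)
print(mean_pooled.shape)  # torch.Size([2, 3])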
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset,TensorDataset,DataLoader
from transformers import AutoModel,AutoTokenizer

gpu_device='cuda:0'
epoch_num=10000
embedding_batch_size=256
feature_dim=768
train_batch_size=2048
inference_batch_size=4096

# Text representation part
tokenizer=AutoTokenizer.from_pretrained("/data/pretrained_model/bert-base-chinese")

class TextInitializeDataset(Dataset):
    """Wrap the raw data in a Dataset; each sample is one text string"""
    def __init__(self,mode='train') -> None:
        data=[json.loads(x) for x in open('/data/other_data/iflytek_public/'+mode+'.json').readlines()]
        self.text=[x['sentence'] for x in data]

    def __getitem__(self, index):
        return self.text[index]

    def __len__(self):
        return len(self.text)

def collate_fn(batch):
    pt_batch=tokenizer(batch,padding=True,truncation=True,max_length=512,return_tensors='pt')
    return {'input_ids':pt_batch['input_ids'],'token_type_ids':pt_batch['token_type_ids'],'attention_mask':pt_batch['attention_mask']}

# Training set
train_dataset=TextInitializeDataset()
train_dataloader=DataLoader(train_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)
# Validation set
dev_dataset=TextInitializeDataset(mode='dev')
dev_dataloader=DataLoader(dev_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)

# Text representation
bert_encoder=AutoModel.from_pretrained("/data/pretrained_model/bert-base-chinese")
bert_encoder.to(gpu_device)
with torch.no_grad():
    bert_encoder.eval()
    # Training set
    train_embedding=torch.zeros(len(train_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(train_dataloader,desc='Encoding the training set'):
        matrix_count+=1
        for key in batch:
            batch[key]=batch[key].to(gpu_device)
        outputs=bert_encoder(**batch)['last_hidden_state']
        outputs[batch['attention_mask']==0]=0  # zero out padded positions
        outputs=outputs.sum(axis=1)/batch['attention_mask'].sum(axis=-1).unsqueeze(-1)  # mean over real tokens only
        train_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
    # Validation set
    dev_embedding=torch.zeros(len(dev_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(dev_dataloader,desc='Encoding the validation set'):
        matrix_count+=1
        for key in batch:
            batch[key]=batch[key].to(gpu_device)
        outputs=bert_encoder(**batch)['last_hidden_state']
        outputs[batch['attention_mask']==0]=0  # zero out padded positions
        outputs=outputs.sum(axis=1)/batch['attention_mask'].sum(axis=-1).unsqueeze(-1)  # mean over real tokens only
        dev_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
2.1.3 Pre-trained Transformer + max pooling
import json
from tqdm import tqdm
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import torch
import torch.nn as nn
from torch.utils.data import Dataset,TensorDataset,DataLoader
from transformers import AutoModel,AutoTokenizer

gpu_device='cuda:0'
epoch_num=10000
embedding_batch_size=256
feature_dim=768
train_batch_size=2048
inference_batch_size=4096

# Text representation part
tokenizer=AutoTokenizer.from_pretrained("/data/pretrained_model/bert-base-chinese")

class TextInitializeDataset(Dataset):
    """Wrap the raw data in a Dataset; each sample is one text string"""
    def __init__(self,mode='train') -> None:
        data=[json.loads(x) for x in open('/data/other_data/iflytek_public/'+mode+'.json').readlines()]
        self.text=[x['sentence'] for x in data]

    def __getitem__(self, index):
        return self.text[index]

    def __len__(self):
        return len(self.text)

def collate_fn(batch):
    pt_batch=tokenizer(batch,padding=True,truncation=True,max_length=512,return_tensors='pt')
    return {'input_ids':pt_batch['input_ids'],'token_type_ids':pt_batch['token_type_ids'],'attention_mask':pt_batch['attention_mask']}

# Training set
train_dataset=TextInitializeDataset()
train_dataloader=DataLoader(train_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)
# Validation set
dev_dataset=TextInitializeDataset(mode='dev')
dev_dataloader=DataLoader(dev_dataset,batch_size=embedding_batch_size,shuffle=False,collate_fn=collate_fn)

# Text representation
bert_encoder=AutoModel.from_pretrained("/data/pretrained_model/bert-base-chinese")
bert_encoder.to(gpu_device)
with torch.no_grad():
    bert_encoder.eval()
    # Training set
    train_embedding=torch.zeros(len(train_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(train_dataloader,desc='Encoding the training set'):
        matrix_count+=1
        for key in batch:
            batch[key]=batch[key].to(gpu_device)
        outputs=bert_encoder(**batch)['last_hidden_state']
        outputs[batch['attention_mask']==0]=outputs.min()  # padded positions get the batch minimum so they cannot win the max
        outputs=torch.max(outputs,dim=1).values
        train_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
    # Validation set
    dev_embedding=torch.zeros(len(dev_dataset),feature_dim)
    matrix_count=-1
    for batch in tqdm(dev_dataloader,desc='Encoding the validation set'):
        matrix_count+=1
        for key in batch:
            batch[key]=batch[key].to(gpu_device)
        outputs=bert_encoder(**batch)['last_hidden_state']
        outputs[batch['attention_mask']==0]=outputs.min()  # padded positions get the batch minimum so they cannot win the max
        outputs=torch.max(outputs,dim=1).values
        dev_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch['input_ids'].size()[0]]=outputs
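The code above keeps padded positions out of the max by overwriting them with the batch minimum. A common alternative, shown here only as a sketch (it is not what the code above does), is to fill padded positions with a large negative constant via masked_fill:

import torch

# Toy stand-ins: 2 sentences, 4 token positions, hidden size 3.
last_hidden_state = torch.randn(2, 4, 3)
attention_mask = torch.tensor([[1, 1, 1, 0],
                               [1, 1, 0, 0]])

# Padded positions become -1e9, so they can never be selected by the max.
masked = last_hidden_state.masked_fill(attention_mask.unsqueeze(-1) == 0, -1e9)
max_pooled = masked.max(dim=1).values
print(max_pooled.shape)  # torch.Size([2, 3])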
2.1.4 w2v word vectors + mean pooling
This post uses pre-trained 300-dimensional dense vectors downloaded from https://pan.baidu.com/s/14JP1gD7hcmsWdSpTvA3vKA (the link comes from https://github.com/Embedding/Chinese-Word-Vectors/blob/master/README_zh.md; it is the largest model listed there).
Loading the word vectors is done with a plain for loop. This could be optimized, but since it only takes a short time I did not bother to speed it up; one possible alternative is sketched after the next paragraph.
For the text representation itself I use torch.nn.Embedding to convert whole batches at once, since in theory this should be somewhat faster than converting each sample with a for loop. I never actually wrote the for-loop version, though, so there is no comparison experiment.
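For reference, here is a sketch of how the manual parsing loop could be replaced by gensim (this is an assumption on my part: it uses the gensim >= 4 KeyedVectors API, is not benchmarked against the loop below, and the rest of the code would need minor changes to consume these variables):

import numpy as np
from gensim.models import KeyedVectors

# Parse the word2vec text format with gensim instead of a Python loop.
kv = KeyedVectors.load_word2vec_format('/data/other_data/merge_sgns_bigram_char300.txt', binary=False)

feature_dim = kv.vector_size  # 300
# Reserve index 0 for padding (all zeros) and the last index for UNK (mean of all vectors),
# mirroring the layout built by the loop below.
pad_vector = np.zeros((1, feature_dim))
unk_vector = kv.vectors.mean(axis=0, keepdims=True)
embedding_weight = np.concatenate([pad_vector, kv.vectors, unk_vector], axis=0)
word2id = {word: i + 1 for i, word in enumerate(kv.index_to_key)}
word2id['UNK'] = embedding_weight.shape[0] - 1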
import json,jieba
from tqdm import tqdm
import numpy as np
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset,DataLoader

gpu_device='cuda:0'
epoch_num=10000
max_sentence_length=512  # cap on tokens per sentence (the number was chosen arbitrarily)
embedding_batch_size=1024
train_batch_size=2048
inference_batch_size=4096

# Text representation part
# Load the word embeddings into memory
embedding_file='/data/other_data/merge_sgns_bigram_char300.txt'
word2id={}
embedding_list=[]
embedding_list.append([0 for _ in range(300)])  # index 0: the padding vector
with open(embedding_file) as f:
    f_content=f.readlines()
# The first line gives the vocabulary size and the embedding dimension.
# From the second line on: the word comes before the first space, the vector after it (space-separated).
pair=f_content[0].split(' ')
feature_dim=int(pair[1])
for sentence_index in tqdm(range(1,len(f_content))):
    sentence=f_content[sentence_index]
    first_space_index=sentence.find(' ')
    word2id[sentence[:first_space_index]]=sentence_index
    embedding_list.append([float(x) for x in sentence[first_space_index:].split()])

# The released vectors contain no UNK entry, so following https://github.com/Embedding/Chinese-Word-Vectors/issues/74
# the mean of all embeddings is used as the UNK vector.
word2id['UNK']=len(f_content)  # index 0 is padding, so all len(f_content) vectors are already present
embedding_weight=np.array(embedding_list)
unk_embedding=np.mean(embedding_weight,axis=0)
embedding_weight=np.concatenate((embedding_weight,np.expand_dims(unk_embedding,0)),axis=0)
print(embedding_weight.shape)

embedding=nn.Embedding(embedding_weight.shape[0],feature_dim)
embedding.weight.data.copy_(torch.from_numpy(embedding_weight))
embedding.weight.requires_grad=False
embedding.to(gpu_device)

def pad_list(v:list,max_length:int):
    """
    v is the list of token indices for one sentence, not yet padded.
    Returns the padded index list and its mask.
    """
    if len(v)>=max_length:
        return (v[:max_length],[1 for _ in range(max_length)])
    else:
        padded_length=max_length-len(v)
        m=[1 for _ in range(len(v))]+[0 for _ in range(padded_length)]
        v.extend([0 for _ in range(padded_length)])
        return (v,m)

def collate_fn(batch):
    jiebaed_text=[jieba.lcut(sentence) for sentence in batch]  # one list of words per sentence
    mapped_text=[[word2id[word] if word in word2id else word2id['UNK'] for word in sentence] for sentence in jiebaed_text]  # one list of word indices per sentence
    max_len=min(max_sentence_length,max([len(x) for x in mapped_text]))  # pad to this length, capped at max_sentence_length
    padded_list=[pad_list(v,max_len) for v in mapped_text]
    numerical_text=torch.tensor([x[0] for x in padded_list])
    mask=torch.tensor([x[1] for x in padded_list])
    return (numerical_text,mask)

# Training set
train_data=[json.loads(x) for x in open('/data/other_data/iflytek_public/train.json').readlines()]
train_text=[x['sentence'] for x in train_data]
train_dataloader=DataLoader(train_text,embedding_batch_size,shuffle=False,collate_fn=collate_fn)
train_embedding=torch.zeros(len(train_text),feature_dim)
matrix_count=-1
for batch in tqdm(train_dataloader):
    matrix_count+=1
    outputs=embedding(batch[0].to(gpu_device))
    outputs=outputs.sum(axis=1)/batch[1].to(gpu_device).sum(axis=1).unsqueeze(1)  # padded positions contribute 0 because the padding embedding is the zero vector
    train_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch[0].size()[0]]=outputs

# Validation set
dev_data=[json.loads(x) for x in open('/data/other_data/iflytek_public/dev.json').readlines()]
dev_text=[x['sentence'] for x in dev_data]
dev_dataloader=DataLoader(dev_text,embedding_batch_size,shuffle=False,collate_fn=collate_fn)
dev_embedding=torch.zeros(len(dev_text),feature_dim)
matrix_count=-1
for batch in tqdm(dev_dataloader):
    matrix_count+=1
    outputs=embedding(batch[0].to(gpu_device))
    outputs=outputs.sum(axis=1)/batch[1].to(gpu_device).sum(axis=1).unsqueeze(1)  # padded positions contribute 0 because the padding embedding is the zero vector
    dev_embedding[matrix_count*embedding_batch_size:matrix_count*embedding_batch_size+batch[0].size()[0]]=outputs
3. Sentence representations
4. Classification model
4.1 Code
(The precomputed feature matrices live on the CPU, so every batch has to be moved to the GPU by hand. If the matrices are small enough they could simply be kept on the GPU, which would be an easy optimization; I skipped it because training is already fairly fast. A sketch of this option follows.)
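A minimal sketch of that optimization, assuming the feature matrices fit comfortably in GPU memory; it would go right before building the DataLoaders in the code below:

# Move the precomputed feature matrices to the GPU once, up front.
train_embedding = train_embedding.to(gpu_device)
dev_embedding = dev_embedding.to(gpu_device)
# The .to(gpu_device) calls on the feature batches inside the loops below
# then become no-ops (the label batches still need to be moved).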
# Build the linear classifier
class LinearClassifier(nn.Module):
    def __init__(self,input_dim,output_dim=119):
        super(LinearClassifier,self).__init__()
        self.dropout=nn.Dropout(0.1)
        self.classifier=nn.Linear(input_dim,output_dim)

    def forward(self,x):
        x=self.dropout(x)
        x=self.classifier(x)
        return x

model=LinearClassifier(feature_dim)
model.to(gpu_device)
optimizer=torch.optim.Adam(params=model.parameters(),lr=1e-4)
loss_func=nn.CrossEntropyLoss()

# Training set
train_labels=torch.tensor([int(json.loads(x)['label']) for x in open('/data/other_data/iflytek_public/train.json').readlines()])
train_dataloader=DataLoader(TensorDataset(train_embedding,train_labels),batch_size=train_batch_size,shuffle=True)
for epoch in tqdm(range(epoch_num),desc='Training the classifier'):
    for batch in train_dataloader:
        model.train()
        optimizer.zero_grad()
        outputs=model(batch[0].to(gpu_device))
        train_loss=loss_func(outputs,batch[1].to(gpu_device))
        train_loss.backward()
        optimizer.step()

# Validation set
dev_label=[int(json.loads(x)['label']) for x in open('/data/other_data/iflytek_public/dev.json').readlines()]
dev_predicts=[]
dev_dataloader=DataLoader(dev_embedding,batch_size=inference_batch_size,shuffle=False)
with torch.no_grad():
    for batch in dev_dataloader:
        model.eval()
        outputs=model(batch.to(gpu_device))
        dev_predicts.extend([i.item() for i in torch.argmax(outputs,1)])

# Accuracy, macro-precision, macro-recall, macro-F1
print(accuracy_score(dev_label,dev_predicts))
print(precision_score(dev_label,dev_predicts,average='macro'))
print(recall_score(dev_label,dev_predicts,average='macro'))
print(f1_score(dev_label,dev_predicts,average='macro'))
4.2 Results
(All metrics are multiplied by 100 and rounded to two decimal places.)