1. Introduction
Dataset download: Sentiment Analysis (stanford.edu)
Stanford's official description of IMDB is quite detailed: it is a dataset for binary sentiment classification, containing a 25k-review training set and a 25k-review test set.
After downloading and extracting the dataset, you get the test and train folders. Inside train, the neg and pos folders hold the labeled sentiment samples, while unsup holds unlabeled samples. What we ultimately need is the neg and pos data under both train and test.
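Before going further, it can help to confirm the extraction worked. The snippet below is a minimal sketch, assuming the archive was unpacked to ../../dataset/aclImdb (the same relative path used in the processing script later in this post):

import os

# Sanity-check the extracted layout: each of the four labeled folders
# should contain 12,500 txt files (25k training + 25k test reviews in total).
base = '../../dataset/aclImdb'
for split in ('train', 'test'):
    for label in ('pos', 'neg'):
        folder = os.path.join(base, split, label)
        print(split, label, len(os.listdir(folder)), 'files')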
2. Data Processing
Merge the samples stored in individual txt files into a single csv dataset file. Three helper functions do the work:
get_data(): load the raw reviews
shuffle_process(): shuffle the data
save_process(): strip punctuation and save the result as datasets.csv
import os
import re

import numpy as np
import pandas as pd


def get_data():
    # Collect the file names of positive/negative samples from the test and train splits
    pos1, pos2 = os.listdir('../../dataset/aclImdb/test/pos'), os.listdir('../../dataset/aclImdb/train/pos')
    neg1, neg2 = os.listdir('../../dataset/aclImdb/test/neg'), os.listdir('../../dataset/aclImdb/train/neg')
    pos_all, neg_all = [], []
    for p1, n1 in zip(pos1, neg1):
        with open('../../dataset/aclImdb/test/pos/' + p1, encoding='utf8') as f:
            pos_all.append(f.read())
        with open('../../dataset/aclImdb/test/neg/' + n1, encoding='utf8') as f:
            neg_all.append(f.read())
    for p2, n2 in zip(pos2, neg2):
        with open('../../dataset/aclImdb/train/pos/' + p2, encoding='utf8') as f:
            pos_all.append(f.read())
        with open('../../dataset/aclImdb/train/neg/' + n2, encoding='utf8') as f:
            neg_all.append(f.read())
    datasets = np.array(pos_all + neg_all)
    labels = np.array([1] * 25000 + [0] * 25000)  # 1 = positive, 0 = negative
    return datasets, labels


def shuffle_process():
    sentences, labels = get_data()
    # Shuffle samples and labels with the same permutation
    shuffle_indexs = np.random.permutation(len(sentences))
    datasets = sentences[shuffle_indexs]
    labels = labels[shuffle_indexs]
    return datasets, labels


def save_process():
    datasets, labels = shuffle_process()
    sentences = []
    punc = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n。!,]'
    for sen in datasets:
        # Remove line breaks, HTML <br /> tags, and punctuation
        sen = sen.replace('\n', '')
        sen = sen.replace('<br /><br />', ' ')
        sen = re.sub(punc, '', sen)
        sentences.append(sen)
    # Save
    df = pd.DataFrame({'labels': labels, 'sentences': sentences})
    df.to_csv("datasets.csv", index=False)


if __name__ == '__main__':
    save_process()
The result after processing:
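To verify the output, the saved file can be read back with pandas. This is a minimal sketch; datasets.csv is the file written by save_process above:

import pandas as pd

# Read the merged dataset back and take a quick look.
df = pd.read_csv('datasets.csv')
print(df.shape)                       # expected: (50000, 2)
print(df['labels'].value_counts())    # 25,000 positive (1) and 25,000 negative (0)
print(df.head())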