1 Introduction
Code download: https://github.com/823316627bandeng/TIANCHI-2021-AI-Compition
These are my own notes on this competition; if you have any questions, feel free to message me to discuss.
Two features are used so far:
- The characters of the text field, with the two most frequent characters removed, converted into a 500-dimensional TF-IDF feature
- The sentence length (int)
2 Steps
2.1 Import packages
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
2.2 Text with high-frequency characters removed, as the first feature
# Remove high-frequency characters from each sentence
def delete_highfrequency_word(data):
    from collections import Counter
    text_Word_frequency = []
    datalen = len(data)
    for i in range(0, datalen):
        # Strip the leading and trailing "|" of the raw text field
        one_lines = ''.join(list(data['text'][i][1:-1]))
        textls = one_lines.strip().split(" ")
        all_word_count = Counter(textls)
        all_word_count = sorted(all_word_count.items(), key=lambda d: d[1], reverse=True)
        # Remove the characters 693 and 328; according to the word-frequency statistics
        # from the data-analysis stage they are probably punctuation
        dict_word_count = dict(all_word_count)
        if dict_word_count.get('693') is not None:
            del dict_word_count['693']
        if dict_word_count.get('328') is not None:
            del dict_word_count['328']
        string_top10_high_frequency_word = list(dict_word_count.keys())
        if '' in string_top10_high_frequency_word:
            string_top10_high_frequency_word.remove('')
        list_to_str = " ".join(string_top10_high_frequency_word)
        text_Word_frequency.append(list_to_str)
    return text_Word_frequency
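A quick sanity check on a tiny hand-made DataFrame (the character ids and the leading/trailing "|" below are only illustrative of the raw format):
sample = pd.DataFrame({'text': ['|624 328 624 693 485|']})
print(delete_highfrequency_word(sample))
# ['624 485'] -> 693 and 328 are removed, each remaining character is kept once,
# ordered by its frequency in the sentence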
2.3 Sentence length as the second feature
# Count the sentence length of each row; it is added to the data as a new feature
def count_text_len(data):
    text_len = []
    datalen = len(data)
    for i in range(0, datalen):
        one_lines = ''.join(list(data['text'][i]))
        len_text = one_lines.strip().split(" ")
        text_len.append(len(len_text))
    return text_len
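The same kind of check for the length feature (again on an invented row):
sample = pd.DataFrame({'text': ['624 485 1243']})
print(count_text_len(sample))   # [3] -> three space-separated characters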
2.4 Handling missing labels
# Handle missing labels: an empty label is filled with the string '17', which stands for "no abnormality"
def label_fill(data):
    new_code_label = []
    datalen = len(data)
    for i in range(0, datalen):
        # Drop the leading "|" of the raw label field
        one_lines = ''.join(list(data['label'][i][1:])).strip()
        if one_lines == '':  # empty label
            new_code_label.append('17')
        else:
            new_code_label.append(one_lines)
    return new_code_label
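A minimal check of the filling behaviour (the function strips the leading "|" of the raw label field, and an empty label becomes '17'):
sample = pd.DataFrame({'label': ['|2 17', '|', '|4']})
print(label_fill(sample))   # ['2 17', '17', '4']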
2.5 Extracting features from the text column with TF-IDF
See the main function below.
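The extraction itself is a plain TfidfVectorizer fitted on the cleaned text; below is a minimal standalone sketch with the same settings as the main function (the toy corpus is invented, the real input is the output of delete_highfrequency_word):
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus in the same "space-separated character ids" format (values invented)
corpus = ["624 1243 485 69", "624 69 1002", "1243 485 69"]

# Same settings as in the main function: unigrams + bigrams, at most 500 features
tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
features = tfidf.fit_transform(corpus)

print(features.shape)          # (3, vocabulary size), here far below the 500 cap
print(len(tfidf.vocabulary_))  # number of distinct unigrams/bigrams kept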
2.6 Label encoding
For example, the label "2 17" is encoded as
[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]
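A quick way to verify this encoding is to call scikit-learn's MultiLabelBinarizer directly with the same fixed class list used in the function below:
from sklearn.preprocessing import MultiLabelBinarizer

classes = [str(i) for i in range(18)]     # '0' ... '17', where '17' means "no abnormality"
mlb = MultiLabelBinarizer(classes=classes)

print(mlb.fit_transform([['2', '17']])[0].tolist())
# [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]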
# Label encoding: one column per class, so an encoded label spans 18 columns
def label_encode(train_data):
    train_data_label = pd.DataFrame(columns=['label'])
    train_data_label['label'] = label_fill(train_data)
    # '17' stands for "no abnormality"
    mlb = MultiLabelBinarizer(classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17'])
    all_train_data_label = pd.DataFrame(columns=['f0', 'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11', 'f12', 'f13', 'f14', 'f15', 'f16', 'f17'], index=[])
    indexsize = 0
    for i in range(len(train_data_label)):
        templist = train_data_label['label'][i].split()
        # Binarize the label list
        label_code_list = list(mlb.fit_transform([templist])[0])
        # Write the encoded row into the DataFrame
        all_train_data_label.loc[indexsize] = label_code_list
        indexsize = indexsize + 1
    return all_train_data_label
3 Main function
import pandas as pd
import numpy as np
from utils import *
from sklearn.feature_extraction.text import TfidfVectorizer
if __name__ == "__main__":
    test_data = pd.read_csv('./data/track1_round1_testA_20210222.csv', header=None, names=['id', 'text'])
    train_data = pd.read_csv('./data/track1_round1_train_20210222.csv', header=None, names=['id', 'text', 'label'])

    # Remove high-frequency characters and strip the leading/trailing "|"
    print("Removing high-frequency characters...")
    train_aug_data = pd.DataFrame()
    train_aug_data['label'] = train_data['label']
    # Characters 693 and 328 are probably punctuation
    train_aug_data['text'] = delete_highfrequency_word(train_data)
    test_data['text'] = delete_highfrequency_word(test_data)

    # Concatenate train and test so the TF-IDF vocabulary is shared
    print(train_aug_data.shape, test_data.shape)
    df = pd.concat([train_aug_data, test_data], axis=0, ignore_index=True)

    print("Extracting text features...")
    tfidf = TfidfVectorizer(ngram_range=(1, 2), max_features=500)
    tfidf_feature = tfidf.fit_transform(df['text'])
    svd_feature = tfidf_feature.toarray()

    # Split back into training and test parts (the test set has 3000 rows)
    train_data_sample = svd_feature[:-3000]
    test_data_sample = svd_feature[-3000:]

    # Sentence length as the second feature
    print("Training set: computing sentence lengths...")
    train_sample = pd.DataFrame(train_data_sample)
    train_sample.columns = ['tfidf_' + str(i) for i in train_sample.columns]
    train_sample['textlen'] = count_text_len(train_aug_data)

    print("Test set: computing sentence lengths...")
    test_sample = pd.DataFrame(test_data_sample)
    test_sample.columns = ['tfidf_' + str(i) for i in test_sample.columns]
    test_sample['textlen'] = count_text_len(test_data)

    # Label encoding
    print("Encoding labels...")
    label = label_encode(train_aug_data)

    label.to_csv('./data/label.csv', index=False)
    train_sample.to_csv('./data/train_sample.csv', index=False)
    test_sample.to_csv('./data/test_sample.csv', index=False)
    print()
4 Data format overview
4.1 Training sample set
train_data_sample = pd.read_csv('./data/train_sample.csv')
print(train_data_sample.info())
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 501 columns):
 #    Column     Non-Null Count   Dtype
 0    tfidf_0    10000 non-null   float64
 ...
 499  tfidf_499  10000 non-null   float64
 500  textlen    10000 non-null   int64
4.2 Training label set
train_data_label = pd.read_csv('./data/label.csv')
print(train_data_label.info())
4.3 Test sample set
test_data_sample = pd.read_csv('./data/test_sample.csv')
print(test_data_sample.info())
RangeIndex: 3000 entries, 0 to 2999
Data columns (total 501 columns):
 #    Column     Non-Null Count  Dtype
 0    tfidf_0    3000 non-null   float64
 ...
 499  tfidf_499  3000 non-null   float64
 500  textlen    3000 non-null   int64
5 Data augmentation (failed)
Methods tried:
- Shuffling the characters within each sentence
- Randomly deleting half of the characters of a sentence
- To deal with class imbalance, duplicating samples of the rare classes and deleting some samples of the over-represented classes
Note: with the machine-learning models, the augmentation below raised the offline (local) accuracy but lowered the online (leaderboard) accuracy; I believe this is caused by a distribution mismatch between the training and test sets. The code below is kept only as a record.
5.1 Shuffle and dropout
While removing the high-frequency characters from the text, shuffle the character order and randomly delete half of the characters.
import random
import numpy as np
from collections import Counter

# Remove high-frequency characters from the text and apply the augmentation
def delete_highfrequency_word_augmentation(data):
    text_Word_frequency = []
    datalen = len(data)
    for i in range(0, datalen):
        one_lines = ''.join(list(data['text'][i][1:-1]))
        textls = one_lines.strip().split(" ")
        # Augmentation: shuffle the character order
        textls = shuffle(textls)
        # Augmentation: with probability 0.5, randomly delete half of the characters
        randn = random.random()
        if randn > 0.5:
            textls = dropout(textls)
        all_word_count = Counter(textls)
        all_word_count = sorted(all_word_count.items(), key=lambda d: d[1], reverse=True)
        # Remove the characters 693 and 328, which are probably punctuation
        dict_word_count = dict(all_word_count)
        if dict_word_count.get('693') is not None:
            del dict_word_count['693']
        if dict_word_count.get('328') is not None:
            del dict_word_count['328']
        string_top10_high_frequency_word = list(dict_word_count.keys())
        if '' in string_top10_high_frequency_word:
            string_top10_high_frequency_word.remove('')
        list_to_str = " ".join(string_top10_high_frequency_word)
        text_Word_frequency.append(list_to_str)
    return text_Word_frequency

def shuffle(textls):
    # Return the characters in a random order
    shuffle_text = np.random.permutation(textls)
    return shuffle_text

def dropout(textls, p=0.5):
    # Randomly delete a fraction p of the characters
    length = len(textls)
    indexs = random.sample(range(length), int(p * length))
    newls = []
    for i, val in enumerate(textls):
        if i not in indexs:
            newls.append(val)
    return newls
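A quick illustration of the two helpers on a toy token list (the exact output varies because both operations are random):
tokens = ['624', '1243', '485', '69', '1002', '328']
print(list(shuffle(tokens)))    # e.g. ['69', '328', '624', '1002', '485', '1243']
print(dropout(tokens, p=0.5))   # e.g. ['1243', '69', '328'] - about half of the tokens survive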
5.2 Over- and under-sampling
Make a certain number of copies of the samples whose labels contain class 3, 5, 6, 10, 12, 13, 16, 14, or 9, and drop part of the samples containing class 15, the class with the most samples.
def data_expand(train_data):
    train_data_label = pd.DataFrame()
    train_data_label['text'] = train_data['text']
    train_data_label['label'] = label_fill(train_data)
    count = 600
    # key is the class label, value is how many extra copies to make of a sample
    # containing that class (up-sampling of the rare classes):
    # 3 -> 3x, 5 -> 3x, 6 -> 5x, 10 -> 3x, 12 -> 3x, 13 -> 3x, 16 -> 2x, 14 -> 2x, 9 -> 2x
    ndict = {'3': 3, '5': 3, '6': 5, '10': 3, '12': 3, '13': 3, '16': 2, '14': 2, '9': 2}
    # Up-sampling: append copies of samples that contain a rare class
    for i in range(len(train_data_label)):
        templist = train_data_label['label'][i].split()
        for key, value in ndict.items():
            if key in templist:
                temp_df = train_data_label.iloc[i]
                for t in range(value):
                    train_data_label.loc[train_data_label.shape[0]] = temp_df
    # Down-sampling: drop part of the samples that contain class 15
    for i in range(len(train_data_label)):
        templist = train_data_label['label'][i].split()
        if ('15' in templist) and (count <= 1668):
            train_data_label.drop([i], inplace=True)
            count += 1
    return train_data_label
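To see what the resampling does to the class distribution, the per-class counts can be compared before and after; a small sketch, assuming train_data has been loaded as in the main function (class_counts is a helper written here only for illustration):
from collections import Counter

def class_counts(labels):
    # labels: iterable of space-separated class strings, e.g. '2 17'
    counts = Counter()
    for lab in labels:
        counts.update(lab.split())
    return counts

expanded = data_expand(train_data)
print(class_counts(label_fill(train_data)))   # before resampling
print(class_counts(expanded['label']))        # after resampling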