# 数据科学比赛经历分享——风机开裂故障预警比赛

02 数据介绍

# Load the per-feature training data and pull out the two label columns
# for plotting. (Original listing had the two imports fused onto one line.)
import seaborn as sns
import pandas as pd

data_file = r"D:\fan_fault\feature1.csv"
pre_process = pd.read_csv(data_file, encoding="gbk")

# Missing sensor readings are treated as zero before inspection/plotting.
pre_process = pre_process.fillna(0)
feature1_plot = pre_process["normal(0)"]
feature2_plot2 = pre_process["fault(1)"]

03 数据分析

04 特征工程

# For every training CSV (1.csv .. 48339.csv, 72 feature columns each),
# compute the mean of each column and append one 72-value row to
# train_mean_new.csv.
import os
import pandas as pd
import numpy as np
import csv

label_file = r"C:\fan_fault\train\trainX"            # folder of per-sample CSVs
train_mean_path = r"D:\fan_fault\train_mean_new.csv" # output file (appended)

# BUG FIX: the original rebound the path variable `train_mean` to the csv
# writer and reused the handle name `f` for both files; distinct names here.
with open(train_mean_path, "a", newline='', encoding="utf-8") as out_f:
    writer = csv.writer(out_f)

    for x in range(1, 48340):
        fan_file = os.path.join(label_file, str(x) + ".csv")
        print("程序运行进度为", x/48340)  # progress indicator

        with open(fan_file, encoding='utf-8') as in_f:
            # NOTE(review): the published listing lost the read line here;
            # reconstructed as a plain read_csv — confirm header handling.
            frame = pd.read_csv(in_f)

            # a temporarily holds the computed per-feature means
            a = []
            for i in range(72):
                # NOTE(review): the column-selection line was lost in the
                # listing; reconstructed as "take column i" — confirm.
                mean_num = np.array(frame.iloc[:, i]).mean()
                a.append(mean_num)

            writer.writerow(a)

# -*- coding: utf-8 -*-
"""Random-forest baseline: scale the features, score a default forest with
cross-validated F1, then grid-search its main hyper-parameters."""

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

# Load the data and check for missing values.
# NOTE(review): the published listing lost the read_csv line; the path is
# reconstructed from the feature-extraction snippet above — confirm.
data = pd.read_csv(r"D:\fan_fault\train_mean_new.csv", encoding="gbk")
data.info()
data.notnull().sum(axis=0) / data.shape[0]  # fraction of non-null values per column
train = data.iloc[:, :-1]
label = data.iloc[:, -1]  # BUG FIX: original indexed the undefined name `label`

# Scale features to [0, 1].
scaler = MinMaxScaler()
train = scaler.fit(train).transform(train)

# Single classifier, cross-validated F1.
clf = RandomForestClassifier(random_state=14)
f1 = cross_val_score(clf, train, label, scoring='f1')
print("f1:{0:.1f}%".format(np.mean(f1) * 100))

# Hyper-parameter search.
parameter_space = {
    'n_estimators': range(10, 200, 10),
    'max_depth': range(1, 10),
    'min_samples_split': range(2, 10),
}
clf = RandomForestClassifier(random_state=14)
grid = GridSearchCV(clf, parameter_space, scoring='f1', n_jobs=6)
grid.fit(train, label)
# BUG FIX: the original wrote "f1:(0:.1f)%" — parentheses instead of braces,
# so .format() never substituted the score into the message.
print("f1:{0:.1f}%".format(grid.best_score_ * 100))
print(grid.best_estimator_)

# Classifier rebuilt with the tuned parameters.
new_clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                                 max_depth=7, max_features='auto', max_leaf_nodes=None,
                                 min_impurity_decrease=0.0, min_impurity_split=None,
                                 min_samples_leaf=1, min_samples_split=7,
                                 min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
                                 oob_score=False, random_state=14, verbose=0, warm_start=False)
# BUG FIX: the original re-printed the stale `f1` of the default forest here;
# score the tuned forest before printing.
f1 = cross_val_score(new_clf, train, label, scoring='f1')
print("f1:{0:.1f}%".format(np.mean(f1) * 100))

# Scale train and test with the SAME scaler fitted on train only.
# BUG FIX: the original called scaler.fit(test) — refitting on the test set
# leaks its statistics and makes the two transforms inconsistent.
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

# Train the tuned random forest.
clf = RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                             max_depth=8, max_features='auto', max_leaf_nodes=None,
                             min_impurity_decrease=0.0, min_impurity_split=None,
                             min_samples_leaf=1, min_samples_split=5,
                             min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=5,
                             oob_score=False, random_state=14, verbose=0, warm_start=False)
clf = clf.fit(train, label)
# Predict test-set labels.
pre = clf.predict(test)

# Write predictions, one label per row.
import csv

# BUG FIX: the original rebound `label` (the training-label name) to this
# path and then to the csv writer; distinct names avoid the shadowing.
label_path = r"D:/fan_fault/label.csv"

with open(label_path, "w", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)
    for x in range(len(pre)):
        writer.writerow(pre[x:x+1])

1# Cross features: 2844 columns (squares of each feature plus pairwise products); mean and variance are taken at the end; the last three features get no cross features of their own.
2
3import os
4import pandas as pd
5import numpy as np
6import csv
7from sklearn.preprocessing import PolynomialFeatures
8
9label_file = r"F:\User\Xinyuan Huang\train_labels.csv"
10fan_folder = r"F:\User\Xinyuan Huang"
12
13cross_var = r"F:\User\Xinyuan Huang\CaiJi\Feature_crosses\cross_var.csv"
14
15with open(cross_var, "a", newline = '', encoding = "utf-8") as f:
16 cross_var = csv.writer(f)
17
# NOTE(review): the published listing lost the for-loop lines here — the
# iteration over the labels DataFrame that binds column1/column2/column3
# used below. Only the loop's comments survived.
18 # this for loop locates the file to open
21 # iterate every value under the f_id label (first DataFrame column)
23 # iterate every value under the file_name label (second DataFrame column)
25 # iterate every value under the ret label (third DataFrame column)
26
27 f_id = column1.split()[1]
28 # slice the row's f_id string to get the bare number
29 # zero-pad f_id
30 f_id = f_id.zfill(3)
31 # e.g. 2 becomes 002, hence width 3
32 file_name = column2.split()[1]
33 # the file_name for this row's file
34 label = column3.split()[1]
35 # the ret (class label) for this row's file
36
37 fan_file = os.path.join(fan_folder, "train", f_id, file_name)
39 # progress indicator
40
41 # open the corresponding fan_file for reading
42 with open(fan_file, encoding='utf-8') as f:
# NOTE(review): the line reading the CSV into `dataset` was lost here.
44 # the dataset is named `dataset`
45 poly = PolynomialFeatures(degree=2, include_bias=False,interaction_only=False)
46 X_ploly = poly.fit_transform(dataset)
47 data_ploly = pd.DataFrame(X_ploly, columns=poly.get_feature_names())
48
# NOTE(review): DataFrame.ix was removed in pandas 1.0 — .iloc is the modern equivalent.
49 new_data = data_ploly.ix[:,75:-6]
50
51 # ploly_mean / ploly_var: mean and variance of the cross features
52 ploly_mean = np.mean(new_data)
# NOTE(review): np.var(ploly_mean) is the variance OF THE COLUMN MEANS (a
# scalar), which list() below could not iterate — np.var(new_data) (per-column
# variance) was probably intended; verify against the original post.
53 ploly_var = np.var(ploly_mean)
54
55 ploly_var = list(ploly_var)
56 ploly_var.append(label)
57
58 cross_var.writerow(ploly_var)

# Count three discrete conditions per test CSV (files 1..450) and append one
# row of three counts per file to discrete56.csv.
1import os
2import pandas as pd
3import numpy as np
4import csv
5
6label_file = r"E:\8_19\testX_csv"
7
8train_mean = r"E:\8_19\disperse\discrete56.csv"
9
10with open(train_mean, "a", newline = '', encoding = "utf-8") as f:
11 train_mean = csv.writer(f)
12
13 for x in range(1, 451):
14 fan_file = os.path.join(label_file, str(x) + ".csv")
15# print("程序运行进度为", x/451) # progress indicator (left commented out)
16
17 with open(fan_file, encoding='utf-8') as f:
# NOTE(review): the published listing lost line 18 (reading the CSV) and
# lines 25-37 (the loop and the three conditions that increment
# num1/num2/num3 below) — the counters' semantics cannot be recovered here.
19
20 num1 = 0
21 num2 = 0
22 num3 = 0
23
24 a = []
25
28 num1 = num1+1
30 num2 = num2+1
32 num3 = num3+1
33
37
38 a.append(num1)
39 a.append(num2)
40 a.append(num3)
41
42 train_mean.writerow(a)

05 算法

# XGBoost tuning, round 1: grid-search max_depth / min_child_weight.
from xgboost import XGBClassifier
import xgboost as xgb

import pandas as pd
import numpy as np

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss
from sklearn.preprocessing import MinMaxScaler


# Load the data and check for missing values.
# NOTE(review): the published listing lost the train read line; reconstructed
# by analogy with the test line — confirm the path.
data = pd.read_csv(r"D:\next\8_19\train_data.csv", encoding="gbk")
test = pd.read_csv(r"D:\next\8_19\test_data.csv", encoding="gbk")
train = data.iloc[:, :-1]
label = data.iloc[:, -1]  # BUG FIX: original read from the undefined name `label`

# Scale train and test with the SAME scaler fitted on train only.
# BUG FIX: the original refit the scaler on test (leakage), and bound
# X_train BEFORE scaling so the search ran on unscaled features.
scaler = MinMaxScaler()
scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)

X_train = train
y_train = label

# Cross-validation splitter (defined here; GridSearchCV below passes cv=5).
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=3)

param_test1 = {
    'max_depth': list(range(3, 10, 1)),
    'min_child_weight': list(range(1, 6, 1))
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=5,
                                                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=4, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=4, iid=False, cv=5)
gsearch1.fit(X_train, y_train)
# NOTE(review): grid_scores_ (and iid=) only exist in sklearn < 0.20;
# on modern sklearn use cv_results_ and drop iid.
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# XGBoost tuning, round 2: learning rate, then subsample.
# BUG FIX: the original read `aram_test1 = {...}` — the dropped 'p' meant the
# learning-rate grid was never assigned, so GridSearchCV silently reused the
# previous param_test1.
param_test1 = {
    'learning_rate': [i / 100.0 for i in range(6, 14, 2)]
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=400, max_depth=6,
                                                min_child_weight=1, gamma=0, subsample=0.8, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=6, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch1.fit(X_train, y_train)
# NOTE(review): grid_scores_ and iid= require sklearn < 0.20 (cv_results_ replaces it).
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

param_test1 = {
    'subsample': [0.8, 0.9]
}
gsearch1 = GridSearchCV(estimator=XGBClassifier(learning_rate=0.1, n_estimators=310, max_depth=6,
                                                min_child_weight=1, gamma=0, subsample=0.9, colsample_bytree=0.8,
                                                objective='binary:logistic', nthread=6, scale_pos_weight=1, seed=27),
                        param_grid=param_test1, scoring='roc_auc', n_jobs=-1, iid=False, cv=5)
gsearch1.fit(X_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

# Final model: train with the native xgboost API and write thresholded
# 0/1 predictions, one per row.
import xgboost as xgb

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(test)

params = {
    'objective': 'binary:logistic',
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'min_child_weight': 1,
    'seed': 27,
    'learning_rate': 0.1,
    # BUG FIX: the original also passed 'n_estimators': 292 here, but
    # xgb.train ignores that sklearn-wrapper argument — the tree count is
    # set solely by num_boost_round below.
    'gamma': 0,
    'scale_pos_weight': 1}

watchlist = [(dtrain, 'train')]

bst = xgb.train(params, dtrain, num_boost_round=100, evals=watchlist)

ypred = bst.predict(dtest)

import csv

# Distinct names: the original rebound `test_label` from path to csv writer.
test_label_path = r"D:\next\8_20\test_label_new.csv"
with open(test_label_path, "a", newline='', encoding="utf-8") as f:
    writer = csv.writer(f)

    # Threshold the predicted probability at 0.5 (folds the original's
    # duplicated if/else writerow branches into one expression).
    for x in range(len(ypred)):
        writer.writerow([0 if ypred[x] < 0.5 else 1])

# -*- coding: utf-8 -*-
"""
Created on Fri May 18 14:09:06 2018

Hyperopt tuning of an XGBoost classifier on a random 70/30 split.
"""

import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
from random import shuffle
from xgboost.sklearn import XGBClassifier
from sklearn.cross_validation import cross_val_score  # sklearn < 0.20; model_selection in newer versions
import pickle
import time
from hyperopt import fmin, tpe, hp, space_eval, rand, Trials, partial, STATUS_OK
import random

data = pd.read_csv(r'D:\next\select_data\new_feature.csv', encoding="gbk").values
# NOTE(review): the line extracting `label` was lost in the published
# listing; reconstructed as the last column — confirm.
label = data[:, -1]
labels = label.reshape((1, -1))
label = labels.tolist()[0]

minmaxscaler = MinMaxScaler()
# BUG FIX: the original scaled all of `data`, which still contains the label
# column — drop it so the label does not leak into the features.
attrs = minmaxscaler.fit_transform(data[:, :-1])

# BUG FIX: the original called random.shuffle(label), which shuffles the
# labels independently of the features (and the shuffled list was never used,
# leaving the split ordered). Shuffle a shared index list instead.
index = list(range(len(label)))
random.shuffle(index)
trainIndex = index[:int(len(label) * 0.7)]
print(len(trainIndex))
testIndex = index[int(len(label) * 0.7):]
print(len(testIndex))
attr_train = attrs[trainIndex, :]
print(attr_train.shape)
attr_test = attrs[testIndex, :]
print(attr_test.shape)
label_train = labels[:, trainIndex].tolist()[0]
print(len(label_train))
label_test = labels[:, testIndex].tolist()[0]
print(len(label_test))
print(np.mat(label_train).reshape((-1, 1)).shape)
def GBM(argsDict):
    """Hyperopt objective: decode the sampled argsDict into XGBClassifier
    hyper-parameters, evaluate with 3-fold cross-validated F1 on the global
    training split, and return the NEGATED score (hyperopt minimizes)."""
    # hp.randint samples start at 0; shift/scale into the intended ranges.
    max_depth = argsDict["max_depth"] + 5
#    n_estimators = argsDict['n_estimators'] * 5 + 50
    n_estimators = 627  # fixed; the sampled value is intentionally ignored
    learning_rate = argsDict["learning_rate"] * 0.02 + 0.05
    subsample = argsDict["subsample"] * 0.1 + 0.7
    min_child_weight = argsDict["min_child_weight"] + 1

    print("max_depth:" + str(max_depth))
    print("n_estimator:" + str(n_estimators))
    print("learning_rate:" + str(learning_rate))
    print("subsample:" + str(subsample))
    print("min_child_weight:" + str(min_child_weight))

    global attr_train, label_train

    # NOTE(review): the constructor line was lost in the published listing,
    # leaving these keyword arguments orphaned; reconstructed as the obvious
    # XGBClassifier call — confirm against the original post.
    gbm = XGBClassifier(
        max_depth=max_depth,              # maximum tree depth
        n_estimators=n_estimators,        # number of trees
        learning_rate=learning_rate,      # learning rate
        subsample=subsample,              # row subsample ratio
        min_child_weight=min_child_weight,
        max_delta_step=50,                # original comment: stop after 50 non-improving steps
        objective="binary:logistic")

    metric = cross_val_score(gbm, attr_train, label_train, cv=3, scoring="f1", n_jobs=-1).mean()
    print(metric)
    return -metric
74
# Search space for hyperopt. Each hp.randint draws 0..n-1; GBM() shifts and
# scales the draws into real hyper-parameter ranges (see its decode lines).
75space = {"max_depth":hp.randint("max_depth",15),
# NOTE(review): GBM() hard-codes n_estimators=627, so this quniform dimension
# is sampled but never used by the objective.
76 "n_estimators":hp.quniform("n_estimators",100,1000,1), #[0,1,2,3,4,5] -> [50,]
77 #"learning_rate":hp.quniform("learning_rate",0.01,0.2,0.01), #[0,1,2,3,4,5] -> 0.05,0.06
78 #"subsample":hp.quniform("subsample",0.5,1,0.1),#[0,1,2,3] -> [0.7,0.8,0.9,1.0]
79 #"min_child_weight":hp.quniform("min_child_weight",1,6,1), #
80
81 #"max_depth":hp.randint("max_depth",15),
82 # "n_estimators":hp.randint("n_estimators",10), #[0,1,2,3,4,5] -> [50,]
83 "learning_rate":hp.randint("learning_rate",6), #[0,1,2,3,4,5] -> 0.05,0.06
84 "subsample":hp.randint("subsample",3),#[0,1,2,3] -> [0.7,0.8,0.9,1.0]
85 "min_child_weight":hp.randint("min_child_weight",2)
86
87 }
# TPE suggestion algorithm; n_startup_jobs random draws before modeling.
88algo = partial(tpe.suggest,n_startup_jobs=1)
89best = fmin(GBM,space,algo=algo,max_evals=50) # max_evals is the number of models to train; larger values search the space more thoroughly
90
91print (best)
# Re-evaluates the objective at the best point (prints the decoded params and
# the negated F1 once more).
92print (GBM(best))

06 最终结果

07 总结

Python爱好者

+ 订阅