1 Basics
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Generate a two-class dataset with a large within-cluster spread
X, y = make_blobs(n_samples=200, centers=2, cluster_std=5)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=38)
# Fit the scaler on the training set only, then transform both sets
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
print("Training set shape:", X_train_scaled.shape)
print("Test set shape:", X_test_scaled.shape)
# The raw training set
plt.scatter(X_train[:, 0], X_train[:, 1])
# The preprocessed training set
plt.scatter(X_train_scaled[:, 0], X_train_scaled[:, 1], marker='^', edgecolor='k')
plt.title("Training set vs. scaled training set")
plt.show()
Output
Training set shape: (150, 2)
Test set shape: (50, 2)
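Before looking at the plot, it is worth checking what StandardScaler actually did to the numbers. A minimal sketch, reusing the variables from the block above: each feature of the scaled training set should end up with mean close to 0 and standard deviation close to 1.

# Per-feature statistics before and after scaling
print("Means before scaling:", X_train.mean(axis=0))
print("Means after scaling: ", X_train_scaled.mean(axis=0))  # ~[0, 0]
print("Stds after scaling:  ", X_train_scaled.std(axis=0))   # ~[1, 1]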
The data processed by StandardScaler() are much more tightly clustered, shown as the triangles in the figure. Next, tune the model with grid search.
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier

params = {'hidden_layer_sizes': [(50,), (100,), (100, 100)],
          'alpha': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(MLPClassifier(max_iter=1600, random_state=38),
                    param_grid=params, cv=3)
grid.fit(X_train_scaled, y_train)
print("Best cross-validation score:\n{:.2%}".format(grid.best_score_))
print("Parameters of the best model:\n{}".format(grid.best_params_))
# Print the model's score on the test set
print("Test set score:\n{:.2%}".format(grid.score(X_test_scaled, y_test)))
Output
Best cross-validation score:
90.00%
Parameters of the best model:
{'alpha': 0.0001, 'hidden_layer_sizes': (50,)}
Test set score:
82.00%
2 The pipeline technique
Using a pipeline:

from sklearn.pipeline import Pipeline

# Chain scaling and the classifier into a single estimator
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('mlp', MLPClassifier(max_iter=1600, random_state=38))])
pipeline.fit(X_train, y_train)
print("Test set score with the pipeline:\n{:.2%}".format(pipeline.score(X_test, y_test)))
Output
Test set score with the pipeline:
86.00%
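Internally, fitting and scoring the pipeline is equivalent to calling the two steps by hand. A minimal sketch of the calls it replaces (reusing the variables and imports above, not the library's actual internals):

# Hand-written equivalent of pipeline.fit / pipeline.score: fit the scaler
# on the training data, transform, then fit and score the classifier.
scaler = StandardScaler().fit(X_train)
mlp = MLPClassifier(max_iter=1600, random_state=38)
mlp.fit(scaler.transform(X_train), y_train)
print("{:.2%}".format(mlp.score(scaler.transform(X_test), y_test)))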
Combining the pipeline with grid search:
# Parameters of a pipeline step are addressed as <step name>__<parameter>
params = {'mlp__hidden_layer_sizes': [(50,), (100,), (100, 100)],
          'mlp__alpha': [0.0001, 0.001, 0.01]}
grid = GridSearchCV(pipeline, param_grid=params, cv=3)
grid.fit(X_train, y_train)
print("Best cross-validation score:\n{:.2%}".format(grid.best_score_))
print("Best model parameters:\n{}".format(grid.best_params_))
print("Test set score:\n{:.2%}".format(grid.score(X_test, y_test)))
Output
Best cross-validation score:
90.00%
Best model parameters:
{'mlp__alpha': 0.0001, 'mlp__hidden_layer_sizes': (50,)}
Test set score:
82.00%
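The 'mlp__' prefix is not special syntax: every parameter reachable through the pipeline is exposed by get_params(), and grid keys must match those names exactly. A quick way to look them up:

# List the tunable parameter names of the pipeline; grid-search keys such as
# 'mlp__alpha' must appear in this list.
print(sorted(pipeline.get_params().keys()))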
Although the score with the pipeline is lower here, it removes a subtle mistake: in the earlier version the scaler was fit on the entire training set before grid search, so information from the validation folds leaked into preprocessing.
GridSearchCV performs its own split into training and validation folds. This is not the split made by train_test_split; it is a further split of the training set that train_test_split produced. With the pipeline, the scaler is refit on each training fold only, so the validation fold stays unseen during preprocessing.
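A minimal sketch of the difference, using the variables defined above; the first variant leaks validation-fold statistics into the scaler, the second does not:

# Wrong: the scaler sees the whole training set, including the rows that
# GridSearchCV will later hold out as validation folds.
scaler = StandardScaler().fit(X_train)
grid = GridSearchCV(MLPClassifier(max_iter=1600, random_state=38),
                    param_grid={'alpha': [0.0001, 0.001, 0.01]}, cv=3)
grid.fit(scaler.transform(X_train), y_train)

# Right: the pipeline refits the scaler inside each cross-validation fold,
# so the validation fold is never used to fit the scaler.
grid = GridSearchCV(pipeline,
                    param_grid={'mlp__alpha': [0.0001, 0.001, 0.01]}, cv=3)
grid.fit(X_train, y_train)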
3 A case study
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler

# The case study uses the stock dataset (the same file the full sweep below reads)
stock = pd.read_csv('stock1.csv', encoding='GBK')
X = stock.loc[:, '价格':'流通市值']
y = stock['涨跌幅']

# Build the pipeline; Pipeline() and make_pipeline() are equivalent,
# make_pipeline() just names the steps automatically
pipeline = Pipeline([('scaler', StandardScaler()),
                     ('mlp', MLPRegressor(max_iter=1600, hidden_layer_sizes=[1, 1],
                                          random_state=6))])
pipe = make_pipeline(StandardScaler(),
                     MLPRegressor(max_iter=1600, hidden_layer_sizes=[1, 1],
                                  random_state=6))
scores = cross_val_score(pipe, X, y, cv=20)
print("Mean model score with the pipe: {:.2%}".format(float(scores.mean())))
Output
Mean model score with the pipe: -80419.24%
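The only difference between the two constructions is how the steps are named, which matters when addressing parameters later. A quick look at both:

# Pipeline() uses the names you give; make_pipeline() derives them from the
# lower-cased class names.
print(pipeline.steps)  # [('scaler', StandardScaler()), ('mlp', MLPRegressor(...))]
print(pipe.steps)      # [('standardscaler', ...), ('mlpregressor', ...)]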
Optimizing with SelectFromModel:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

# Insert a feature-selection step driven by a random forest
pipe = make_pipeline(StandardScaler(),
                     SelectFromModel(RandomForestRegressor(random_state=6)),
                     MLPRegressor(max_iter=1600, hidden_layer_sizes=[1, 1],
                                  random_state=6))
scores = cross_val_score(pipe, X, y, cv=20)
print("Mean model score with SelectFromModel in the pipe: {:.2%}".format(float(scores.mean())))
Output
Mean model score with SelectFromModel in the pipe: -56190.48%
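To see which columns SelectFromModel kept, the fitted selector can be inspected. A minimal sketch, fitting the pipeline once outside cross-validation purely for inspection:

# Fit once on the full data just to look at the selector (not an evaluation)
pipe.fit(X, y)
mask = pipe.named_steps['selectfrommodel'].get_support()
print("Selected features:", list(X.columns[mask]))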
Next, use GridSearchCV:
from sklearn.model_selection import GridSearchCV

# The grid can swap out whole pipeline steps: 'reg' and 'scaler' are the step
# names, and None drops the scaling step entirely
params = [{'reg': [MLPRegressor(max_iter=1600, hidden_layer_sizes=[1, 1],
                                random_state=6)],
           'scaler': [StandardScaler(), None]},
          {'reg': [RandomForestRegressor(random_state=6)],
           'scaler': [None]}]
pipe = Pipeline([('scaler', StandardScaler()), ('reg', MLPRegressor())])
grid = GridSearchCV(pipe, params, cv=6)
grid.fit(X, y)
print("Best model found by GridSearchCV: {}".format(grid.best_params_))
print("Best score found by GridSearchCV: {:.2%}".format(grid.best_score_))
Output
Best model found by GridSearchCV: {'reg': RandomForestRegressor(random_state=6), 'scaler': None}
Best score found by GridSearchCV: -12.45%
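Swapping a step in the grid works because pipeline steps are themselves parameters. A sketch of what one grid candidate amounts to:

# One grid candidate is equivalent to replacing the steps by hand
pipe.set_params(scaler=None, reg=RandomForestRegressor(random_state=6))
print(pipe.steps)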
Next, tune the parameters:
params = [{'reg': [MLPRegressor(max_iter=1600, random_state=6)],
           'scaler': [StandardScaler(), None],
           'reg__hidden_layer_sizes': [(1,), (50,), (100,), (1, 1),
                                       (50, 50), (100, 100)]},
          {'reg': [RandomForestRegressor(random_state=6)],
           'scaler': [None],
           'reg__n_estimators': [10, 50, 100]}]
pipe = Pipeline([('scaler', StandardScaler()), ('reg', MLPRegressor())])
grid = GridSearchCV(pipe, params, cv=6)
grid.fit(X, y)
print("Best model after adding parameters: {}".format(grid.best_params_))
print("Best score after adding parameters: {:.2%}".format(grid.best_score_))
Output
Best model after adding parameters: {'reg': RandomForestRegressor(random_state=6), 'reg__n_estimators': 100, 'scaler': None}
Best score after adding parameters: -12.45%
So -12.45% appears to be the ceiling here. That result is not very satisfying, so let us sweep over all the algorithms and parameters covered so far to find the best algorithm and its settings.
import warnings
import pandas as pd
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import (AdaBoostClassifier, AdaBoostRegressor,
                              RandomForestClassifier, RandomForestRegressor,
                              VotingClassifier)
from sklearn.linear_model import (ElasticNet, Lasso, LinearRegression,
                                  LogisticRegression, Ridge)
from sklearn.model_selection import GridSearchCV, ShuffleSplit
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

def get_better_score():
    warnings.filterwarnings("ignore")
    n_jobs = 2
    # One grid entry per candidate estimator. The target is continuous, so the
    # classifier entries fail to fit and score NaN; they are kept only so the
    # sweep covers every model seen so far.
    params = [
        {'reg': [LinearRegression()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_jobs': [n_jobs]},
        {'reg': [LogisticRegression()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_jobs': [n_jobs]},
        {'reg': [Ridge()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__alpha': [1, 0.1, 0.001, 0.0001]},
        {'reg': [Lasso()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__alpha': [1, 0.1, 0.001, 0.0001]},
        {'reg': [ElasticNet()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__alpha': [0.1, 0.5, 1, 5, 10], 'reg__l1_ratio': [0.1, 0.5, 0.9]},
        {'reg': [RandomForestClassifier()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_estimators': [4, 5, 6, 7], 'reg__n_jobs': [n_jobs],
         'reg__random_state': list(range(0, 200))},
        {'reg': [RandomForestRegressor()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_estimators': [4, 5, 6, 7], 'reg__n_jobs': [n_jobs],
         'reg__random_state': list(range(0, 200))},
        {'reg': [DecisionTreeClassifier()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__max_depth': [1, 3, 5, 7], 'reg__random_state': list(range(1, 200))},
        {'reg': [DecisionTreeRegressor()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__max_depth': [1, 3, 5, 7], 'reg__random_state': list(range(1, 200))},
        {'reg': [KNeighborsClassifier()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_jobs': [n_jobs]},
        {'reg': [KNeighborsRegressor()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__n_jobs': [n_jobs]},
        {'reg': [BernoulliNB()], 'scaler': [StandardScaler(), MinMaxScaler(), None]},
        {'reg': [GaussianNB()], 'scaler': [StandardScaler(), MinMaxScaler(), None]},
        # MultinomialNB needs non-negative inputs, hence MinMaxScaler only
        {'reg': [MultinomialNB()], 'scaler': [MinMaxScaler()]},
        {'reg': [SVC(max_iter=10000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__kernel': ["linear", "rbf", "sigmoid", "poly"],
         'reg__gamma': [0.01, 0.1, 1, 5, 10], 'reg__C': [1.0, 3.0, 5.0]},
        {'reg': [SVR(max_iter=100000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__kernel': ["linear", "rbf", "sigmoid", "poly"],
         'reg__gamma': [0.01, 0.1, 1, 5, 10], 'reg__C': [1.0, 3.0, 5.0]},
        {'reg': [LinearSVC(max_iter=100000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__C': [1.0, 3.0, 5.0]},
        {'reg': [LinearSVR(max_iter=100000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__C': [1.0, 3.0, 5.0]},
        {'reg': [AdaBoostClassifier()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__random_state': list(range(1, 200))},
        {'reg': [AdaBoostRegressor()], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__random_state': list(range(1, 200))},
        {'reg': [VotingClassifier(estimators=[
             ('log_clf', LogisticRegression()),
             ('svm_clf', SVC(probability=True)),
             ('dt_clf', DecisionTreeClassifier(random_state=666))])],
         'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__voting': ["hard", "soft"], 'reg__n_jobs': [n_jobs]},
        {'reg': [LinearDiscriminantAnalysis(n_components=2)],
         'scaler': [StandardScaler(), MinMaxScaler(), None]},
        {'reg': [MLPClassifier(max_iter=100000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__activation': ["relu", "tanh", "identity", "logistic"],
         'reg__alpha': [0.0001, 0.001, 0.01, 1],
         'reg__hidden_layer_sizes': [(1,), (50,), (100,), (1, 1), (50, 50), (100, 100)]},
        {'reg': [MLPRegressor(max_iter=100000)], 'scaler': [StandardScaler(), MinMaxScaler(), None],
         'reg__activation': ["relu", "tanh", "identity", "logistic"],
         'reg__alpha': [0.0001, 0.001, 0.01, 1],
         'reg__hidden_layer_sizes': [(1,), (50,), (100,), (1, 1), (50, 50), (100, 100)]},
    ]
    stock = pd.read_csv('stock1.csv', encoding='GBK')
    X = stock.loc[:, '价格':'流通市值']
    y = stock['涨跌幅']
    pipe = Pipeline([('scaler', StandardScaler()), ('reg', MLPRegressor())])
    shuffle_split = ShuffleSplit(test_size=.2, train_size=.7, n_splits=10)
    grid = GridSearchCV(pipe, params, cv=shuffle_split)
    grid.fit(X, y)
    print("Best model: {}".format(grid.best_params_))
    print("Best cross-validation score: {:.2%}".format(grid.best_score_))
    print("Score on the full dataset: {:.2%}".format(grid.score(X, y)))
Output
Best model: {'reg': LinearRegression(n_jobs=2), 'reg__n_jobs': 2, 'scaler': StandardScaler()}
Best cross-validation score: 100.00%
Score on the full dataset: 100.00%
This result is quite surprising. Let us fit a LinearRegression model directly after StandardScaler() and check.
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def best_stock():
    stock = pd.read_csv('stock1.csv', encoding='GBK')
    X = stock.loc[:, '价格':'流通市值']
    y = stock['涨跌幅']
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=62)
    clf = LinearRegression(n_jobs=2)
    # Fit the scaler on the training set only
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    clf.fit(X_train_scaled, y_train)
    print("Model training score: {:.2%}".format(clf.score(X_train_scaled, y_train)))
    print("Model test score: {:.2%}".format(clf.score(X_test_scaled, y_test)))
Output
Model training score: 100.00%
Model test score: 100.00%
Without any model selection at all, simply scaling with StandardScaler() and fitting LinearRegression gives 100% on both the training and test sets. A perfect score on both splits is worth double-checking: it usually means the target is (nearly) a linear function of the input columns rather than evidence of a genuinely predictive model.
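One way to check is to look at the fitted coefficients. A minimal sketch, assuming clf and X from best_stock() are available in scope (for example, if the function body is run at module level):

# Print each feature's coefficient in the fitted linear model; one dominant
# coefficient would suggest 涨跌幅 is computed directly from that column.
for name, c in zip(X.columns, clf.coef_):
    print("{}: {:.6f}".format(name, c))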