1 训练数据数量将会变少
2 评估模型时要采用测试集
1.1 准备数据
import numpy as np import pandas as pd import matplotlib.pyplot as plt
# Load the data with pandas; header=None means the file has no header row,
# so the column names are supplied explicitly.
path = 'ex2data1.txt'
data = pd.read_csv(
    path,
    names=['Exam1', 'Exam2', 'Admitted'],
    header=None,
)
# Preview the first rows of the table.
data.head()
Exam1 | Exam2 | Admitted | |
0 | 34.623660 | 78.024693 | 0 |
1 | 30.286711 | 43.894998 | 0 |
2 | 35.847409 | 72.902198 | 0 |
3 | 60.182599 | 86.308552 | 1 |
4 | 79.032736 | 75.344376 | 1 |
# Separate the rows labelled 1 (admitted) from the rows labelled 0.
positive = data[data["Admitted"] == 1]
negative = data[data["Admitted"] == 0]

# Prepare the training data: every column except the last is a feature,
# the last column is the label.
col_num = data.shape[1]
X = data.iloc[:, :-1]
y = data.iloc[:, -1]

# Prepend a constant column of ones as the first feature column.
X.insert(loc=0, column="ones", value=1)
X.shape
(100, 3)
# Convert the feature table to a plain NumPy array.
X = X.to_numpy()
X.shape
(100, 3)
# Convert the labels to a plain NumPy array.
y = y.to_numpy()
y.shape

from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as a test set; random_state=0 fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.2,
)
(array([[ 1. , 82.36875376, 40.61825516], [ 1. , 56.2538175 , 39.26147251], [ 1. , 60.18259939, 86.3085521 ], [ 1. , 64.03932042, 78.03168802], [ 1. , 62.22267576, 52.06099195], [ 1. , 62.0730638 , 96.76882412], [ 1. , 61.10666454, 96.51142588], [ 1. , 74.775893 , 89.5298129 ], [ 1. , 67.31925747, 66.58935318], [ 1. , 47.26426911, 88.475865 ], [ 1. , 75.39561147, 85.75993667], [ 1. , 88.91389642, 69.8037889 ], [ 1. , 94.09433113, 77.15910509], [ 1. , 80.27957401, 92.11606081], [ 1. , 99.27252693, 60.999031 ], [ 1. , 93.1143888 , 38.80067034], [ 1. , 70.66150955, 92.92713789], [ 1. , 97.64563396, 68.86157272], [ 1. , 30.05882245, 49.59297387], [ 1. , 58.84095622, 75.85844831], [ 1. , 30.28671077, 43.89499752], [ 1. , 35.28611282, 47.02051395], [ 1. , 94.44336777, 65.56892161], [ 1. , 51.54772027, 46.85629026], [ 1. , 79.03273605, 75.34437644], [ 1. , 53.97105215, 89.20735014], [ 1. , 67.94685548, 46.67857411], [ 1. , 83.90239366, 56.30804622], [ 1. , 74.78925296, 41.57341523], [ 1. , 45.08327748, 56.31637178], [ 1. , 90.44855097, 87.50879176], [ 1. , 71.79646206, 78.45356225], [ 1. , 34.62365962, 78.02469282], [ 1. , 40.23689374, 71.16774802], [ 1. , 61.83020602, 50.25610789], [ 1. , 79.94481794, 74.16311935], [ 1. , 75.01365839, 30.60326323], [ 1. , 54.63510555, 52.21388588], [ 1. , 34.21206098, 44.2095286 ], [ 1. , 90.54671411, 43.39060181], [ 1. , 95.86155507, 38.22527806], [ 1. , 85.40451939, 57.05198398], [ 1. , 40.45755098, 97.53518549], [ 1. , 32.57720017, 95.59854761], [ 1. , 82.22666158, 42.71987854], [ 1. , 68.46852179, 85.5943071 ], [ 1. , 52.10797973, 63.12762377], [ 1. , 80.366756 , 90.9601479 ], [ 1. , 39.53833914, 76.03681085], [ 1. , 52.34800399, 60.76950526], [ 1. , 76.97878373, 47.57596365], [ 1. , 38.7858038 , 64.99568096], [ 1. , 91.5649745 , 88.69629255], [ 1. , 99.31500881, 68.77540947], [ 1. , 55.34001756, 64.93193801], [ 1. , 66.74671857, 60.99139403], [ 1. , 67.37202755, 42.83843832], [ 1. , 89.84580671, 45.35828361], [ 1. 
, 72.34649423, 96.22759297], [ 1. , 50.4581598 , 75.80985953], [ 1. , 62.27101367, 69.95445795], [ 1. , 64.17698887, 80.90806059], [ 1. , 94.83450672, 45.6943068 ], [ 1. , 77.19303493, 70.4582 ], [ 1. , 34.18364003, 75.23772034], [ 1. , 66.56089447, 41.09209808], [ 1. , 74.24869137, 69.82457123], [ 1. , 82.30705337, 76.4819633 ], [ 1. , 78.63542435, 96.64742717], [ 1. , 32.72283304, 43.30717306], [ 1. , 75.47770201, 90.424539 ], [ 1. , 33.91550011, 98.86943574], [ 1. , 89.67677575, 65.79936593], [ 1. , 57.23870632, 59.51428198], [ 1. , 84.43281996, 43.53339331], [ 1. , 42.26170081, 87.10385094], [ 1. , 49.07256322, 51.88321182], [ 1. , 44.66826172, 66.45008615], [ 1. , 97.77159928, 86.72782233], [ 1. , 51.04775177, 45.82270146]]), array([0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0], dtype=int64), array([[ 1. , 80.19018075, 44.82162893], [ 1. , 42.07545454, 78.844786 ], [ 1. , 35.84740877, 72.90219803], [ 1. , 49.58667722, 59.80895099], [ 1. , 99.8278578 , 72.36925193], [ 1. , 74.49269242, 84.84513685], [ 1. , 69.07014406, 52.74046973], [ 1. , 60.45788574, 73.0949981 ], [ 1. , 50.28649612, 49.80453881], [ 1. , 83.48916274, 48.3802858 ], [ 1. , 34.52451385, 60.39634246], [ 1. , 55.48216114, 35.57070347], [ 1. , 60.45555629, 42.50840944], [ 1. , 69.36458876, 97.71869196], [ 1. , 75.02474557, 46.55401354], [ 1. , 61.37928945, 72.80788731], [ 1. , 50.53478829, 48.85581153], [ 1. , 77.92409145, 68.97235999], [ 1. , 52.04540477, 69.43286012], [ 1. , 76.0987867 , 87.42056972]]), array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1], dtype=int64))
# Inspect the shapes of the four arrays produced by the split.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))
((80, 3), (20, 3), (80,), (20,))
((80, 3), (20, 3))
1.2 定义假设函数
Sigmoid 函数
def sigmoid(z):
    """Return the logistic sigmoid ``1 / (1 + exp(-z))``, element-wise.

    The textbook formula evaluates ``np.exp(-z)``, which overflows (and emits
    a RuntimeWarning) for large negative ``z``. This version only ever
    exponentiates non-positive numbers, so it cannot overflow.

    Args:
        z: A real scalar or array-like of real numbers.

    Returns:
        A NumPy float scalar for scalar input, otherwise an ndarray with the
        same shape as ``z``; every value lies in [0, 1].
    """
    z = np.asarray(z)
    # exp(-|z|) lies in (0, 1], so it can never overflow.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + exp(-z))        (the original formula)
    # z <  0: exp(z) / (1 + exp(z))    (algebraically identical, overflow-free)
    result = np.where(z >= 0, 1 / (1 + decay), decay / (1 + decay))
    # Indexing with () unwraps a 0-d result to a scalar and leaves arrays as-is.
    return result[()]
# Hypothesis function: h(x) = 1 / (1 + exp(-w^T x))
def h(X, w):
    """Return the logistic hypothesis ``sigmoid(X @ w)``.

    Args:
        X: Left operand of the matrix product (the feature matrix).
        w: Right operand of the matrix product (the weights).

    Returns:
        The result of applying ``sigmoid`` to ``X @ w``.
    """
    return sigmoid(X @ w)
1.3 定义代价函数
((100, 3), (100,), (100, 1))
# Cost function for logistic regression.
def cost(X, w, y):
    """Return the mean binary cross-entropy of weights ``w`` on ``(X, y)``.

    Computes ``-(1/m) * sum(y * log(p) + (1 - y) * log(1 - p))`` with
    ``p = h(X, w)`` and ``m = X.shape[0]``.

    Args:
        X: Feature matrix; per the original note, shape (m, n + 1).
        w: Weights; per the original note, shape (n + 1, 1).
        y: Labels; per the original note, shape (m,). Flattened before use,
            so a column vector works as well.

    Returns:
        The scalar cost.
    """
    labels = np.ravel(y)
    probs = np.ravel(h(X, w))
    # Keep probabilities strictly inside (0, 1). A saturated prediction of
    # exactly 0 or 1 would otherwise produce log(0) = -inf, and 0 * -inf = nan.
    eps = np.finfo(float).eps
    probs = np.clip(probs, eps, 1 - eps)
    per_sample = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    return -np.sum(per_sample) / X.shape[0]
# Set the initial weights: one zero per column of X, stored as a column vector.
w = np.zeros(shape=(X.shape[1], 1))
# Check the cost at the initial weights.
cost(X=X, w=w, y=y)
1.4 定义梯度下降算法
gradient descent(梯度下降)
- 这是批量梯度下降(batch gradient descent)
(100, 1)
def grandient(X, y, iter_num, alpha):
    """Fit logistic-regression weights with batch gradient descent.

    Weights start at zero and every iteration updates all of them
    simultaneously from the gradient over the full data set. The (misspelled)
    function name is kept so existing calls keep working.

    Args:
        X: 2-D NumPy feature matrix with one row per sample.
        y: Labels with ``X.shape[0]`` elements; reshaped to a column here.
        iter_num: Number of gradient-descent iterations to run.
        alpha: Learning rate (step size).

    Returns:
        A tuple ``(w, cost_lst)``: ``w`` is the weight column of shape
        ``(X.shape[1], 1)`` and ``cost_lst`` holds the cost measured after
        each iteration's update.
    """
    m, n = X.shape
    y = np.reshape(y, (m, 1))
    y_flat = y.ravel()  # loop-invariant: cost() takes the flattened labels
    w = np.zeros((n, 1))
    cost_lst = []
    for _ in range(iter_num):
        residual = h(X, w) - y
        # One matrix product yields every partial derivative at once:
        # grad[j] = (1/m) * sum_i residual[i] * X[i, j].
        grad = X.T @ residual / m
        # Build a new array so all weights are updated simultaneously.
        w = w - alpha * grad
        cost_lst.append(cost(X, w, y_flat))
    return w, cost_lst
此处进行的调整为：采用训练集 X_train、y_train 进行训练
((80, 3), (20, 3))
# Train on the training split: iteration count and learning rate.
iter_num = 100000
alpha = 0.001
w, cost_lst = grandient(X_train, y_train, iter_num, alpha)
[<matplotlib.lines.Line2D at 0x1d0f1417d30>]
Xw 的形状：X 为 (m, n)，w 为 (n, 1)，因此 Xw 为 (m, 1)
array([[-4.86722837], [ 0.04073083], [ 0.04257751]])
1.5 绘制决策边界
1.6 计算准确率
# Predicted labels on the training set: 1 where the hypothesis exceeds 0.5,
# otherwise 0.
y_train_true = (h(X_train, w).ravel() > 0.5).astype(int)
y_train_true
array([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0])
# Accuracy on the training set: the fraction of predictions that match the labels.
np.mean(y_train_true == y_train)
# Predicted labels on the test set: 1 where the hypothesis exceeds 0.5,
# otherwise 0.
y_p_true = np.where(h(X_test, w).ravel() > 0.5, 1, 0)
y_p_true
array([1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1])
array([1, 0, 0, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1], dtype=int64)
1.7 试试用Sklearn来解决
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression with its default settings on the
# training split; fit() returns the estimator, so the calls can be chained.
clf = LogisticRegression().fit(X_train, y_train)
clf
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
# Accuracy on the training set.
clf.score(X=X_train, y=y_train)
# On the test set, however, it is only 0.8.
clf.score(X=X_test, y=y_test)