Logistic Regression Binary Classifier
The dataset testSet.txt stores samples in the following format: two tab-separated feature columns followed by an integer class label.
Let the first column be feature x1, the second column feature x2, and the third column the label z.
Multiplying each feature by a regression coefficient w, we have
$$z = w_0 x_0 + w_1 x_1 + w_2 x_2 \qquad (x_0 = 1)$$
In vector notation this can be written as
$$z = W^T X$$
Substituting z into the Sigmoid function gives:
$$\sigma(z) = \frac{1}{1 + e^{-z}}, \qquad z = W^T X$$

$$\sigma(W^T X) = \frac{1}{1 + e^{-W^T X}}$$
Because of the shape of its curve, the Sigmoid function is well suited to binary classification: samples with output greater than 0.5 are assigned to one class, and those with output less than 0.5 to the other.
The Sigmoid function:
```python
import matplotlib.pyplot as plt
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

nums = np.arange(-5, 5, step=0.3)
fig = plt.figure(figsize=(12, 4))
ax = fig.add_subplot(111)
ax.plot(nums, sigmoid(nums), 'r')
plt.show()
```
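Applying the 0.5 threshold rule in code looks like this (a minimal sketch; the classify() helper is illustrative and not part of the modules used later):

```python
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

# Illustrative threshold rule: outputs above 0.5 go to class 1, the rest to class 0
def classify(x, weights):
    prob = sigmoid(np.dot(x, weights))
    return 1 if prob > 0.5 else 0

# Example with made-up weights; x[0] = 1 plays the role of x0
print(classify(np.array([1.0, 0.5, -1.2]), np.array([0.1, 0.2, 0.3])))
```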
Gradient ascent:
Denote the gradient operator by $\nabla$; the gradient of a function $f(x, y)$ is then

$$\nabla f(x,y) = \begin{pmatrix} \dfrac{\partial f(x,y)}{\partial x} \\[2ex] \dfrac{\partial f(x,y)}{\partial y} \end{pmatrix}$$

For example, for $f(x,y) = x^2 + y^2$ the gradient is $\nabla f = (2x,\ 2y)^T$, which at every point is the direction of steepest ascent.
The gradient gives the direction in which the function changes fastest; let $\alpha$ denote the size of each move, also called the "step size". The gradient ascent iteration is then:
$$w := w + \alpha \nabla_w f(w)$$

$$\Rightarrow\ w := w + \alpha\,(z - \sigma(z))\,X$$
The corresponding gradient descent iteration is
$$w := w - \alpha \nabla_w f(w)$$

$$\Rightarrow\ w := w - \alpha\,(z - \sigma(z))\,X$$
With this, we can use gradient ascent to search for the optimal regression coefficients.
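To make the update rule concrete, here is a single vectorized gradient-ascent step on made-up data (a sketch; this is the same update the bpGradientAscent() function below performs in a loop):

```python
import numpy as np

X = np.array([[1.0,  0.5, -1.2],   # rows are [x0 = 1, x1, x2]
              [1.0, -0.3,  0.8]])
y = np.array([1.0, 0.0])           # class labels
w = np.zeros(3)                    # initial weights
alpha = 0.001                      # step size

error = y - 1.0 / (1 + np.exp(-X @ w))  # label minus sigmoid(W^T X) per sample
w = w + alpha * (X.T @ error)           # w := w + alpha * X^T * error
print(w)
```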
Plotting the data points with Matplotlib:
```python
import matplotlib.pyplot as plt
import TxtToNumpy

dataMat, labelList = TxtToNumpy.TxtToNumpy("testSet.txt")

type0_x = []; type0_y = []
type1_x = []; type1_y = []
for i in range(len(labelList)):
    if labelList[i] == 0:
        type0_x.append(dataMat[i][0])
        type0_y.append(dataMat[i][1])
    if labelList[i] == 1:
        type1_x.append(dataMat[i][0])
        type1_y.append(dataMat[i][1])

fig = plt.figure(figsize=(8, 4))
ax = fig.add_subplot(111)
type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
type1 = ax.scatter(type1_x, type1_y, s=30, c='b')
ax.set_xlabel("X1")
ax.set_ylabel("X2")
ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
plt.show()
```
The TxtToNumpy.py module:
```python
from numpy import *

# Read the tab-separated file into a NumPy array plus a label list
def TxtToNumpy(filename):
    file = open(filename)
    file_lines_list = file.readlines()
    number_of_file_lines = len(file_lines_list)
    dataMat = zeros((number_of_file_lines, 3))
    labelList = []
    index = 0
    for line in file_lines_list:
        line = line.strip()
        line_list = line.split('\t')
        # Store all three columns; the last column doubles as the class label
        dataMat[index, :] = line_list[0:3]
        labelList.append(int(line_list[-1]))
        index += 1
    return dataMat, labelList

if __name__ == "__main__":
    print("Code Run As A Program")
```
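For reference, the same parsing can be done with NumPy's built-in loader (an equivalent sketch; the rest of this post keeps using the TxtToNumpy module above):

```python
import numpy as np

# Columns 0-2 as floats (matching dataMat), last column as the integer label
data = np.loadtxt("testSet.txt", delimiter='\t')
dataMat, labelList = data[:, 0:3], data[:, -1].astype(int).tolist()
```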
Plotting the decision boundary:
① Batch gradient ascent for the weights, then plot the decision boundary:
Batch gradient ascent traverses the entire dataset for every update of the regression coefficients, so it is the most accurate of the methods here, but its computational cost is also very high.
The BpGradientAscent.py module:
```python
# coding: utf-8
# Batch Gradient Ascent
import numpy as np
import matplotlib.pyplot as plt

# Read the data and labels from the txt file into the lists dataList and labelList
def loadDataSet(filename):
    dataList = []
    labelList = []
    fr = open(filename)
    for line in fr.readlines():
        # Split each line into its elements
        lineArr = line.strip().split()
        # The three entries are the X in z = W^T X; the first entry (x0) is fixed at 1.0
        dataList.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelList.append(int(lineArr[2]))
    return dataList, labelList

# Sigmoid function, used for classification
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

# Batch gradient ascent for the weights W;
# alpha is the step size, maxCycles the maximum number of iterations
def bpGradientAscent(filename, alpha=0.001, maxCycles=500):
    dataList, labelList = loadDataSet(filename)
    dataMatrix = np.mat(dataList)
    # transpose() turns the label row vector into a column vector
    labelMatrix = np.mat(labelList).transpose()
    m, n = np.shape(dataMatrix)
    weights = np.ones((n, 1))
    for i in range(maxCycles):
        sig = sigmoid(dataMatrix * weights)
        error = labelMatrix - sig
        weights = weights + alpha * dataMatrix.transpose() * error
    # getA() converts the matrix back to an ndarray
    return weights.getA()

# Plot the decision boundary
def decisionBoundary(weights, filename):
    dataMat, labelMat = loadDataSet(filename)
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    type0_x = []; type0_y = []
    type1_x = []; type1_y = []
    for i in range(n):
        if labelMat[i] == 0:
            type0_x.append(dataMat[i][1])
            type0_y.append(dataMat[i][2])
        if labelMat[i] == 1:
            type1_x.append(dataMat[i][1])
            type1_y.append(dataMat[i][2])
    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(111)
    type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
    type1 = ax.scatter(type1_x, type1_y, s=30, c='b')
    # Boundary: w0 + w1*x1 + w2*x2 = 0, i.e. x2 = -(w0 + w1*x1) / w2
    x1 = np.arange(-4.5, 4.5, 0.1)
    x2 = (-weights[0] - weights[1] * x1) / weights[2]
    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
    ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
    ax.plot(x1, x2)
    plt.show()

if __name__ == "__main__":
    print("Code Run as a Program!")
```
Calling the BpGradientAscent.py module:
```python
import matplotlib.pyplot as plt
import numpy as np
import BpGradientAscent

BpGradientAscent.decisionBoundary(BpGradientAscent.bpGradientAscent("testSet.txt"), "testSet.txt")
```
The resulting decision boundary (blue line):
② Small-batch stochastic gradient ascent for the weights, then plot the decision boundary:
Small-batch stochastic gradient ascent uses only a subset of the data for each update of the regression coefficients. Its accuracy is somewhat lower than batch gradient ascent, but its computational cost is also much lower, and the accuracy of the decision boundary can be improved by tuning the step size and the maximum number of iterations.
The SbsGradientAscent.py module:
```python
# coding: utf-8
# Small-Batch Stochastic Gradient Ascent
import numpy as np
import matplotlib.pyplot as plt

# Read the data and labels from the txt file into the lists dataList and labelList
def loadDataSet(filename):
    dataList = []
    labelList = []
    fr = open(filename)
    for line in fr.readlines():
        # Split each line into its elements
        lineArr = line.strip().split()
        # The three entries are the X in z = W^T X; the first entry (x0) is fixed at 1.0
        dataList.append([1.0, float(lineArr[0]), float(lineArr[1])])
        labelList.append(int(lineArr[2]))
    return dataList, labelList

# Sigmoid function, used for classification
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

# Small-batch stochastic gradient ascent for the weights;
# maxCycles is the maximum number of iterations
def sbsGradientAscent(filename, maxCycles=300):
    dataList, labelList = loadDataSet(filename)
    m, n = np.shape(dataList)
    weights = np.ones(n)
    for i in range(maxCycles):
        dataIndex = list(range(m))
        for j in range(m):
            # alpha is the step size; it decays as the iterations progress
            alpha = 4 / (1.0 + i + j) + 0.001
            # uniform() draws a random value in the given range
            randomIndex = int(np.random.uniform(0, len(dataIndex)))
            sample = dataIndex[randomIndex]
            error = labelList[sample] - sigmoid(sum(dataList[sample] * weights))
            weights = weights + alpha * error * np.array(dataList[sample])
            # Remove the sampled index so it is not picked again within this pass
            del(dataIndex[randomIndex])
    return weights

# Plot the decision boundary
def decisionBoundary(weights, filename):
    dataMat, labelMat = loadDataSet(filename)
    dataArr = np.array(dataMat)
    n = np.shape(dataArr)[0]
    type0_x = []; type0_y = []
    type1_x = []; type1_y = []
    for i in range(n):
        if labelMat[i] == 0:
            type0_x.append(dataMat[i][1])
            type0_y.append(dataMat[i][2])
        if labelMat[i] == 1:
            type1_x.append(dataMat[i][1])
            type1_y.append(dataMat[i][2])
    fig = plt.figure(figsize=(8, 4))
    ax = fig.add_subplot(111)
    type0 = ax.scatter(type0_x, type0_y, s=30, c='r')
    type1 = ax.scatter(type1_x, type1_y, s=30, c='b')
    # Boundary: w0 + w1*x1 + w2*x2 = 0, i.e. x2 = -(w0 + w1*x1) / w2
    x1 = np.arange(-4.5, 4.5, 0.1)
    x2 = (-weights[0] - weights[1] * x1) / weights[2]
    ax.set_xlabel("X1")
    ax.set_ylabel("X2")
    ax.legend((type0, type1), ("Class 0", "Class 1"), loc=0)
    ax.plot(x1, x2)
    plt.show()

if __name__ == "__main__":
    print("Code Run as a Program!")
```
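Note that the step size inside sbsGradientAscent() is not constant: alpha = 4 / (1.0 + i + j) + 0.001 shrinks as training progresses, so early updates move the weights much more than later ones. A quick way to inspect the schedule:

```python
# Print the first few values of the decaying step size used in sbsGradientAscent()
for i in range(2):          # outer iteration i
    for j in range(3):      # inner pass j over the samples
        alpha = 4 / (1.0 + i + j) + 0.001
        print("i=%d j=%d alpha=%.4f" % (i, j, alpha))
```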
Calling the SbsGradientAscent.py module:
```python
import matplotlib.pyplot as plt
import numpy as np
import SbsGradientAscent

SbsGradientAscent.decisionBoundary(SbsGradientAscent.sbsGradientAscent("testSet.txt"), "testSet.txt")
```
The resulting decision boundary (blue line):
The only difference between the BpGradientAscent.py and SbsGradientAscent.py modules is their bpGradientAscent() and sbsGradientAscent() functions, which compute the weights by batch gradient ascent and by small-batch stochastic gradient ascent, respectively.
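Since the modules differ only in the training function, a rough side-by-side timing makes the cost trade-off concrete (a sketch; the numbers depend on the machine and on maxCycles):

```python
import time
import BpGradientAscent
import SbsGradientAscent

start = time.time()
w_batch = BpGradientAscent.bpGradientAscent("testSet.txt")
print("batch gradient ascent:       %.3f s" % (time.time() - start))

start = time.time()
w_sgd = SbsGradientAscent.sbsGradientAscent("testSet.txt")
print("small-batch stochastic:      %.3f s" % (time.time() - start))
```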
The bpGradientAscent() function:
```python
# Batch gradient ascent for the weights W;
# alpha is the step size, maxCycles the maximum number of iterations
def bpGradientAscent(filename, alpha=0.001, maxCycles=500):
    dataList, labelList = loadDataSet(filename)
    dataMatrix = np.mat(dataList)
    # transpose() turns the label row vector into a column vector
    labelMatrix = np.mat(labelList).transpose()
    m, n = np.shape(dataMatrix)
    weights = np.ones((n, 1))
    for i in range(maxCycles):
        sig = sigmoid(dataMatrix * weights)
        error = labelMatrix - sig
        weights = weights + alpha * dataMatrix.transpose() * error
    # getA() converts the matrix back to an ndarray
    return weights.getA()
```
The sbsGradientAscent() function:
```python
# Small-batch stochastic gradient ascent for the weights;
# maxCycles is the maximum number of iterations
def sbsGradientAscent(filename, maxCycles=300):
    dataList, labelList = loadDataSet(filename)
    m, n = np.shape(dataList)
    weights = np.ones(n)
    for i in range(maxCycles):
        dataIndex = list(range(m))
        for j in range(m):
            # alpha is the step size; it decays as the iterations progress
            alpha = 4 / (1.0 + i + j) + 0.001
            # uniform() draws a random value in the given range
            randomIndex = int(np.random.uniform(0, len(dataIndex)))
            sample = dataIndex[randomIndex]
            error = labelList[sample] - sigmoid(sum(dataList[sample] * weights))
            weights = weights + alpha * error * np.array(dataList[sample])
            # Remove the sampled index so it is not picked again within this pass
            del(dataIndex[randomIndex])
    return weights
```
The final decision boundaries (blue lines):
bpGradientAscent():
sbsGradientAscent():