一步步实现朴素贝叶斯
from collections import Counter

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# Class labels, in the order used by every per-class array in this cell.
# The messages printed below treat class 0 as female (F) and class 1 as male (M).
n_class = [0, 1]


def get_prior(label):
    """Return the empirical prior of each class, ordered like ``n_class``.

    The priors are looked up per class label instead of following the
    iteration order of the ``Counter`` (order of first appearance in
    ``label``), so entry ``k`` always belongs to ``n_class[k]`` and lines up
    with the columns produced by ``Calculate_likelihood``.
    """
    counts = Counter(label)
    prior = np.array([counts[c] / len(label) for c in n_class])
    print("训练集的先验概率[F,M]:", prior)
    return prior


prior = get_prior(y_train)


def avgs(data, label):
    """Return the per-class feature means, one row per entry of ``n_class``."""
    return np.array([data[label == i].mean(axis=0) for i in n_class])


avg = avgs(X_train, y_train)
print("train_data的均值:\n", avg)


# NOTE: this name shadows the builtin ``vars``; it is kept so that existing
# references to it keep working.
def vars(data, label):
    """Return the per-class feature variances, one row per entry of ``n_class``."""
    return np.array([data[label == i].var(axis=0) for i in n_class])


var = vars(X_train, y_train)
print("train_data的方差:\n", var)


# Compute the likelihood
def Calculate_likelihood(row):
    """Return the Gaussian naive-Bayes likelihood of each sample under each class.

    ``row`` is reshaped to (n_samples, 1, n_features) so it broadcasts against
    the (n_classes, n_features) arrays ``avg`` and ``var``; the per-feature
    densities are then multiplied together along the feature axis. The feature
    count is taken from ``avg`` rather than being fixed to 2.
    """
    n_features = avg.shape[-1]
    samples = row.reshape(-1, 1, n_features)
    densities = 1 / np.sqrt(2 * np.pi * var) * np.exp(-(samples - avg) ** 2 / (2 * var))
    return densities.prod(axis=2)


a = Calculate_likelihood(X_test2)
# Unnormalised posterior: likelihood times prior, one column per class.
probs = a * prior
prob_sum = probs.sum(axis=1)
# Predicted class = column index of the largest normalised posterior.
b = (probs / prob_sum[:, None]).argmax(axis=1)


def get_acc(y_test, y_hat):
    """Print accuracy plus per-class precision and recall of ``y_hat``."""
    precision = precision_score(y_test, y_hat, average=None)
    recall = recall_score(y_test, y_hat, average=None)
    print("准确度:", (y_hat == y_test).sum() / len(y_hat))
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test2, b)
封装为类
from collections import Counter


class GaussianNB1():
    """Gaussian naive Bayes classifier for the two classes listed in ``n_class``.

    Attributes:
        prior: per-class prior probabilities, ordered like ``n_class``.
        avgs:  per-class feature means, shape (n_classes, n_features).
        vars:  per-class feature variances, shape (n_classes, n_features).
        n_class: the class labels, fixed to ``[0, 1]``.
    """

    def __init__(self):
        self.prior = None
        self.avgs = None
        self.vars = None
        self.n_class = [0, 1]

    def get_prior(self, label):
        """Return the empirical class priors of ``label``, ordered like ``n_class``.

        Looking each class up explicitly (instead of iterating the Counter,
        whose order is the order of first appearance in ``label``) guarantees
        that entry ``k`` is the prior of ``n_class[k]``.
        """
        counts = Counter(label)
        return np.array([counts[c] / len(label) for c in self.n_class])

    def avg_s(self, data, label):
        """Return the per-class feature means of ``data``."""
        return np.array([data[label == i].mean(axis=0) for i in self.n_class])

    def var_s(self, data, label):
        """Return the per-class feature variances of ``data``."""
        return np.array([data[label == i].var(axis=0) for i in self.n_class])

    def fit(self, data, label, prior=None):
        """Estimate the class statistics from the training data.

        Args:
            data: training samples, indexable by a boolean mask over rows.
            label: class label of each training sample.
            prior: optional class priors ordered like ``n_class``; when
                omitted they are estimated from ``label``.
        """
        if prior is None:
            self.prior = self.get_prior(label)
        else:
            self.prior = np.asarray(prior, dtype=float)
        self.avgs = self.avg_s(data, label)
        self.vars = self.var_s(data, label)

    def Calculate_likelihood(self, data_test):
        """Return the likelihood of every test sample under every class.

        The samples are reshaped to (n_samples, 1, n_features) so they
        broadcast against the (n_classes, n_features) statistics; the feature
        count comes from the fitted means instead of being fixed to 2.
        """
        n_features = self.avgs.shape[-1]
        samples = np.asarray(data_test).reshape(-1, 1, n_features)
        norm = 1 / np.sqrt(2 * np.pi * self.vars)
        kernel = np.exp(-(samples - self.avgs) ** 2 / (2 * self.vars))
        # Naive Bayes: multiply the per-feature densities together.
        return (norm * kernel).prod(axis=2)

    def predict(self, data_test):
        """Return, for each test sample, the index of the most probable class."""
        a = self.Calculate_likelihood(data_test)
        probs = a * self.prior
        prob_sum = probs.sum(axis=1)
        return (probs / prob_sum[:, None]).argmax(axis=1)

    def get_acc(self, y_test, y_hat):
        """Print and return accuracy and the per-class precision of ``y_hat``.

        Returns:
            Tuple ``(accuracy, precision of class 0, precision of class 1)``.
        """
        accuracy = (y_hat == y_test).sum() / len(y_hat)
        # Computed once and reused for both the printout and the return value.
        precision = precision_score(y_test, y_hat, average=None)
        print("准确度:", accuracy)
        print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
        return (accuracy, precision[0], precision[1])
# Metric histories for the sweep below, filled from the three values
# returned by get_acc (in the order it returns them).
last1, last2, last3 = [], [], []

# Baseline: fit without an explicit prior, then evaluate on the first test set.
clf1 = GaussianNB1()
clf1.fit(X_train, y_train)
print("如果按照训练集计算的先验证概率")
y_hat = clf1.predict(X_test1)
clf1.get_acc(y_test1, y_hat)

# Refit with each hand-picked prior pair and record the resulting metrics.
for fixed_prior in ([0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2]):
    print("如果按照", fixed_prior, "的先验证概率")
    clf1.fit(X_train, y_train, fixed_prior)
    y_hat = clf1.predict(X_test1)
    metrics = clf1.get_acc(y_test1, y_hat)
    for history, value in zip((last1, last2, last3), metrics):
        history.append(value)
如果按照训练集计算的先验证概率 准确度: 0.8870967741935484 女生的精确度: 0.825 男生的精确度: 0.9166666666666666 如果按照 [0.5, 0.5] 的先验证概率 准确度: 0.8629032258064516 女生的精确度: 0.7555555555555555 男生的精确度: 0.9240506329113924 如果按照 [0.6, 0.4] 的先验证概率 准确度: 0.8629032258064516 女生的精确度: 0.7446808510638298 男生的精确度: 0.935064935064935 如果按照 [0.7, 0.3] 的先验证概率 准确度: 0.8467741935483871 女生的精确度: 0.7142857142857143 男生的精确度: 0.9333333333333333 如果按照 [0.8, 0.2] 的先验证概率 准确度: 0.8145161290322581 女生的精确度: 0.660377358490566 男生的精确度: 0.9295774647887324
# Metric histories for the second test set, filled from the three values
# returned by get_acc (in the order it returns them).
last4, last5, last6 = [], [], []

# Refit with each hand-picked prior pair and evaluate on the second test set.
for fixed_prior in ([0.5, 0.5], [0.6, 0.4], [0.7, 0.3], [0.8, 0.2]):
    print("如果按照", fixed_prior, "的先验证概率")
    clf1.fit(X_train, y_train, fixed_prior)
    y_hat = clf1.predict(X_test2)
    metrics = clf1.get_acc(y_test2, y_hat)
    for history, value in zip((last4, last5, last6), metrics):
        history.append(value)
如果按照 [0.5, 0.5] 的先验证概率 准确度: 0.8888888888888888 女生的精确度: 0.6363636363636364 男生的精确度: 0.9705882352941176 如果按照 [0.6, 0.4] 的先验证概率 准确度: 0.8666666666666667 女生的精确度: 0.5833333333333334 男生的精确度: 0.9696969696969697 如果按照 [0.7, 0.3] 的先验证概率 准确度: 0.8555555555555555 女生的精确度: 0.56 男生的精确度: 0.9692307692307692 如果按照 [0.8, 0.2] 的先验证概率 准确度: 0.8111111111111111 女生的精确度: 0.4827586206896552 男生的精确度: 0.9672131147540983
import matplotlib
import matplotlib.pyplot as plt

# Avoid garbled Chinese text: render labels with the SimHei font.
matplotlib.rcParams['font.sans-serif'] = ['SimHei']

fig, ax = plt.subplots(1, 2, figsize=(15, 5))

# x holds the FIRST entry of each prior pair used in the sweeps
# ([0.5,0.5], [0.6,0.4], [0.7,0.3], [0.8,0.2]). That entry is the prior of
# class 0, which the printed metrics label as female, so the axis is labelled
# as the female prior.
x = [0.5, 0.6, 0.7, 0.8]

# (axes, metric histories, title) for each panel. The left panel shows the
# X_test1 sweep and the right panel the X_test2 sweep.
panels = (
    (ax[0], (last1, last2, last3), "Dataset2测试结果图示"),
    (ax[1], (last4, last5, last6), "Dataset3测试结果图示"),
)
# (format string, marker, legend label) for the three curves of every panel.
curve_styles = (("r", "o", "整体"), ("g", "*", "女士"), ("b", "*", "男士"))

for axis, curves, title in panels:
    for values, (color, marker, label) in zip(curves, curve_styles):
        axis.plot(x, values, color, marker=marker, ms=10, label=label)
    axis.set_ylim(0.3, 1.05)
    axis.set_xlabel('女生的先验概率')
    axis.set_ylabel('准确度')
    axis.legend()
    axis.set_title(title)

plt.savefig("a.jpg")
plt.show()
分析
随着女生(类别0)先验概率的增加(即男生先验概率相应减小):
男性分类正确率呈上升趋势;
女性分类正确率呈下降趋势;
整体分类正确率在二者之间波动;
分析:
先验概率的选择对试验结果有一定的影响;
先验概率与数据集的整体分布有关
基于sklearn朴素贝叶斯模型
# Build the model
clf = GaussianNB()
# Train the model on the training set.
# (A stray `GaussianNB(priors=None)` expression that followed this call was a
# no-op whose result was discarded, and has been removed.)
clf.fit(X_train, y_train)
# Check the model's accuracy on both test sets
print("tsst_data1的准确率", clf.score(X_test1, y_test1))
print("tsst_data2的准确率", clf.score(X_test2, y_test2))
工作二
基于线性分类器进行性别分类,同样以dataset1作为训练样本,dataset2和dataset3作为测试样本。
基于线性判别分析
# -*- coding: utf-8 -*-
# Created on Fri Mar 13 12:02:21 2020
# Fisher algorithm
# @author: lihuanyu

#%% LDA implementation
def Calculate_means_cov(X, y):
    """Fit Fisher's linear discriminant for the two classes 0 and 1.

    Args:
        X: samples, one row per sample.
        y: class label (0 or 1) of each sample.

    Returns:
        Tuple ``(means, w)``: ``means`` holds the mean of class 0 and class 1
        (one row each); ``w`` is the projection direction
        ``S_w^+ (means[0] - means[1])`` as a column vector, where ``S_w`` is
        the within-class scatter matrix.
    """
    n_class = [0, 1]
    means = np.array([X[y == i].mean(axis=0) for i in n_class])
    n_features = means.shape[1]

    # Within-class scatter: each class is centred on ITS OWN mean before the
    # outer products are accumulated, then the two class scatters are summed.
    s_w = np.zeros((n_features, n_features))
    for label, class_mean in zip(n_class, means):
        centered = X[y == label] - class_mean
        s_w += centered.T.dot(centered)

    # SVD-based pseudo-inverse; also copes with a singular scatter matrix.
    s_w_inv = np.linalg.pinv(s_w)
    return (means, s_w_inv.dot((means[0] - means[1]).reshape(n_features, 1)))


#%% Classification
mean, w = Calculate_means_cov(X_train, y_train)
# Projections of the two class means onto the discriminant direction.
kernel_girl = np.dot(w.T, mean[0].reshape(len(mean[0]), 1))
kernel_boy = np.dot(w.T, mean[1].reshape(len(mean[1]), 1))


def Distinguish(X, y):
    """Classify each row of ``X`` and print the accuracy against ``y``.

    A sample is projected onto ``w`` and assigned to class 1 when its
    projection is strictly closer to the projected class-1 mean than to the
    projected class-0 mean; otherwise it is assigned to class 0.
    """
    projected = np.asarray(X).dot(w)
    closer_to_boy = np.abs(projected - kernel_girl) > np.abs(projected - kernel_boy)
    y_pre = closer_to_boy.ravel().astype(int)
    print(accuracy_score(y, y_pre))


Distinguish(X_test1, y_test1)
Distinguish(X_test2, y_test2)

#%% Plotting
import matplotlib.pyplot as plt
from matplotlib.pylab import mpl
from matplotlib.ticker import FuncFormatter

plt.scatter(X_train[y_train == 0, 0], X_train[y_train == 0, 1], color='red')
plt.scatter(X_train[y_train == 1, 0], X_train[y_train == 1, 1], color='blue')

# Decision boundary actually used by Distinguish: the points whose projection
# equals the midpoint of the two projected class means, i.e.
# w[0]*x + w[1]*y = threshold.
line_x = np.arange(150, 190)
threshold = ((kernel_girl + kernel_boy) / 2).item()
line_y = (threshold - w[0] * line_x) / w[1]
plt.plot(line_x, line_y, linewidth=3.0, label='fisher boundary line ')
plt.legend(loc='upper right')
plt.xlabel('feature 1')
plt.ylabel('feature 2')
plt.show()
0.8870967741935484
0.8777777777777778
分析
线性分类器的效果并不是很理想,我们可以尝试利用SVM分类器或集成学习等方法来改进分类效果。
工作三
以dataset1为训练数据库,进行基于Parzen窗方法的概率密度估计,在0.5-0.5先验概率条件下进行基于最小错误率的贝叶斯分类,并以dataset2和dataset3为测试数据库分析分类性能。
from collections import Counter

import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.neighbors import KernelDensity

# Fit one Gaussian-kernel density estimate (bandwidth 0.75) per class:
# kde_woman on the training samples labelled 0, kde_man on those labelled 1.
kde_woman, kde_man = (
    KernelDensity(kernel='gaussian', bandwidth=0.75).fit(X_train[y_train == class_label])
    for class_label in (0, 1)
)
# For test1
# score_samples returns log-densities; keep one column per class.
a = kde_woman.score_samples(X_test1)[:, np.newaxis]
b = kde_man.score_samples(X_test1)[:, np.newaxis]
log_density = np.concatenate((a, b), axis=1)
# Subtract each row's maximum before exponentiating. The normalised posterior
# is unchanged by this per-row scaling, but a sample far from all training
# data can no longer underflow to 0 in both columns (which would give 0/0 = NaN
# and silently predict class 0).
c = np.exp(log_density - log_density.max(axis=1, keepdims=True))
probs = c * [0.5, 0.5]
prob_sum = probs.sum(axis=1)
# Predicted class = column index of the largest posterior.
b = (probs / prob_sum[:, None]).argmax(axis=1)


def get_acc(y_test, y_hat):
    """Print accuracy plus per-class precision and recall for test_data1."""
    precision = precision_score(y_test, y_hat, average=None)
    recall = recall_score(y_test, y_hat, average=None)
    print("test_data1准确度:", (y_hat == y_test).sum() / len(y_hat))
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test1, b)
test_data1准确度: 0.8548387096774194 女生的精确度: 0.7619047619047619 男生的精确度: 0.9024390243902439 女生的召回率: 0.8 男生的召回率: 0.8809523809523809
# For test2
# score_samples returns log-densities; keep one column per class.
a = kde_woman.score_samples(X_test2)[:, np.newaxis]
b = kde_man.score_samples(X_test2)[:, np.newaxis]
log_density = np.concatenate((a, b), axis=1)
# Subtract each row's maximum before exponentiating. The normalised posterior
# is unchanged by this per-row scaling, but a sample far from all training
# data can no longer underflow to 0 in both columns (which would give 0/0 = NaN
# and silently predict class 0).
c = np.exp(log_density - log_density.max(axis=1, keepdims=True))
probs = c * [0.5, 0.5]
prob_sum = probs.sum(axis=1)
# Predicted class = column index of the largest posterior.
b = (probs / prob_sum[:, None]).argmax(axis=1)


def get_acc(y_test, y_hat):
    """Print accuracy plus per-class precision and recall for test_data2."""
    precision = precision_score(y_test, y_hat, average=None)
    recall = recall_score(y_test, y_hat, average=None)
    print("test_data2准确度:", (y_hat == y_test).sum() / len(y_hat))
    print("女生的精确度:", precision[0], "\n男生的精确度:", precision[1])
    print("女生的召回率:", recall[0], "\n男生的召回率:", recall[1])


get_acc(y_test2, b)
分析
我们的核密度估计采用的是高斯核,带宽为0.75。在先验概率相同的情况下(均为0.5),我们发现从总体上而言test_data1和test_data2的准确率都比较好,但是就各自的类别而言,男生的查准率和查全率都比较高,而女生的效果不是很理想,我认为这与先验概率有一定关系