数据集
- Dataset1.txt:328 个同学的身高、体重、性别数据(78 个女生、250 个男生)
- Dataset2.txt:124 个同学的数据(40 女、84 男)
- Dataset3.txt:90 个同学的数据(16 女、74 男)
百度网盘 提取码:8plu
工作一
以dataset1为训练数据库,假设身高与体重满足高斯分布,
进行高斯分布的参数估计,
并进行基于最小错误率的贝叶斯分类,分别考虑男女先验概率为 0.5-0.5、0.6-0.4、0.7-0.3、0.8-0.2 的情形,
并以dataset2和dataset3为测试数据库分析分类性能,
并探讨先验概率对分类性能的影响。
导入数据
# Load the training set and the two test sets (height, weight, sex per row),
# convert them to numeric arrays, and draw one scatter plot per data set.
import csv

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from matplotlib.ticker import FuncFormatter
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB


def _load_csv(path):
    """Read a comma-separated file into a 2-D array of strings."""
    with open(path, encoding='utf-8') as f:
        return np.loadtxt(f, str, delimiter=",")


train_data = _load_csv(r'train_data.csv')
print("打印训练集前5个数据集:\n", train_data[:5])

test_data1 = _load_csv(r'test_data1.csv')
print("打印测试集1前5个数据集:\n", test_data1[:5])

test_data2 = _load_csv(r'test_data2.csv')
# Fixed label: this preview shows test set 2, not test set 1.
print("打印测试集2前5个数据集:\n", test_data2[:5])

# Split every data set into feature columns (first two) and the label column.
X_train, X_test1, X_test2 = train_data[:, :2], test_data1[:, :2], test_data2[:, :2]
y_train, y_test1, y_test2 = train_data[:, 2], test_data1[:, 2], test_data2[:, 2]

# Report the array shapes.
print("查看X_train特征维度:", X_train.shape, "\n查看y_train特征维度:", y_train.shape)
print("查看X_test1特征维度:", X_test1.shape, "\n查看y_test1特征维度:", y_test1.shape)
# Fixed label: the second shape printed here belongs to y_test2.
print("查看X_test2特征维度:", X_test2.shape, "\n查看y_test2特征维度:", y_test2.shape)

# The files are read as strings; convert the features to floats.
X_train = X_train.astype(float)
X_test1 = X_test1.astype(float)
X_test2 = X_test2.astype(float)

# Encode the labels: "M" -> 1, anything else -> 0.
y_train = (y_train == 'M').astype(int)
y_test1 = (y_test1 == 'M').astype(int)
y_test2 = (y_test2 == 'M').astype(int)

# Use a font with CJK glyphs and keep the minus sign renderable with it.
mpl.rcParams['font.sans-serif'] = ['SimHei']
mpl.rcParams['axes.unicode_minus'] = False

fig, ax = plt.subplots(3, 1, figsize=(9, 25))

# The first subplot shows the training set, so its title says so
# (the original titled it as a test set).
_panels = [
    (X_train, y_train, '训练集的散点分布'),
    (X_test1, y_test1, 'test_data1的散点分布'),
    (X_test2, y_test2, 'test_data2的散点分布'),
]
for _axis, (_features, _labels, _title) in zip(ax, _panels):
    _male = _labels == 1
    _female = _labels == 0
    _axis.scatter(_features[_male, 0], _features[_male, 1],
                  marker="o", c="green", label="男士", alpha=1)
    _axis.scatter(_features[_female, 0], _features[_female, 1],
                  marker="x", c="red", label="女士", alpha=1)
    _axis.set_xlim(140, 200)
    _axis.set_ylim(35, 110)
    # Column 0 (x axis) is height and column 1 (y axis) is weight; the
    # original labels were swapped.
    _axis.set_xlabel('身高')
    _axis.set_ylabel('体重')
    _axis.legend()
    _axis.set_title(_title)

plt.show()
打印训练集前5个数据集: [['161' '46' 'F'] ['160' '56' 'F'] ['163' '50' 'F'] ['169' '54' 'F'] ['160' '48' 'F']] 打印测试集1前5个数据集: [['151' '42' 'F'] ['153' '48' 'F'] ['155' '43' 'F'] ['158' '49' 'F'] ['158' '58' 'F']] 打印测试集1前5个数据集: [['163' '48' 'F'] ['169' '50' 'F'] ['170' '50' 'F'] ['167' '50' 'F'] ['167' '55' 'F']] 查看X_train特征维度: (328, 2) 查看y_train特征维度: (328,) 查看X_test1特征维度: (124, 2) 查看y_test1特征维度: (124,) 查看X_test2特征维度: (90, 2) 查看X_test2特征维度: (90,)