训练代码(scala)
import org.apache.spark.mllib.classification.{NaiveBayes,NaiveBayesModel}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkContext,SparkConf}

/**
 * Trains an MLlib naive Bayes classifier on a text data set, reports its accuracy
 * on a held-out split, prints one sample prediction and saves the model.
 *
 * Each input line is parsed as `label,f1 f2 f3 ...`: the text before the comma is
 * the numeric label and the text after it is a space-separated list of numeric
 * feature values.
 */
object NaiveBayes {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // This object has the same simple name as MLlib's NaiveBayes, so an unqualified
    // `NaiveBayes.train` inside it does not resolve cleanly to the MLlib trainer.
    // Bring the trainer in under an unambiguous alias instead.
    import org.apache.spark.mllib.classification.{NaiveBayes => MLlibNaiveBayes}

    val conf = new SparkConf()
      .setMaster("local")
      .setAppName("NaiveBayes")
    val sc = new SparkContext(conf)

    try {
      val path = "../data/sample_football_weather.txt"
      val data = sc.textFile(path)

      // parts(0) is the label, parts(1) holds the space-separated feature values.
      val parsedData = data.map { line =>
        val parts = line.split(',')
        LabeledPoint(parts(0).toDouble, Vectors.dense(parts(1).split(' ').map(_.toDouble)))
      }

      // Split the samples: 60% for training, 40% for testing (fixed seed).
      val splits = parsedData.randomSplit(Array(0.6, 0.4), seed = 11L)
      val training = splits(0)
      val test = splits(1)

      // Train the model: the first argument is the data, the second is the
      // smoothing parameter (1.0 here; it can be changed).
      val model = MLlibNaiveBayes.train(training, lambda = 1.0)

      // Evaluate on the test split: pair each prediction with the true label.
      val predictionAndLabel = test.map(p => (model.predict(p.features), p.label))
      val correct = predictionAndLabel.filter(x => x._1 == x._2).count()
      val total = test.count()
      // Floating-point division: an empty test split yields NaN rather than throwing.
      val accuracy = 1.0 * correct / total

      // Print the measured accuracy.
      println("NaiveBayes精度----->" + accuracy)

      // Print one sample prediction: for a day that is sunny (0), cool (2),
      // high (0), high (1) -- will football be played or not?
      println("假如一天是 晴天(0)凉(2)高(0)高(1) 踢球与否:" + model.predict(Vectors.dense(0.0, 2.0, 0.0, 1.0)))

      // Save the model.
      val modelPath = "../model/NaiveBayes_model.obj"
      model.save(sc, modelPath)
      // val testmodel = NaiveBayesModel.load(sc, modelPath)
    } finally {
      // Always stop the SparkContext, even when the job fails part-way through.
      sc.stop()
    }
  }
}
NaiveBayes
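类的分布估计调整为:$P(c)=\dfrac{N_c+\lambda}{N+K\lambda}$,其中 $N_c$ 为类别 $c$ 的训练样本数,$N$ 为训练样本总数,$K$ 为类别数,$\lambda$ 为平滑参数。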
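多项式模型下的参数估计调整为:$P(x_j\mid c)=\dfrac{N_{cj}+\lambda}{\sum_{k=1}^{n}N_{ck}+n\lambda}$,其中 $N_{cj}$ 为类别 $c$ 的训练样本中特征 $j$ 的取值之和,$n$ 为特征数,$\lambda$ 为平滑参数。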
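伯努利模型下参数估计调整为:$P(x_j=1\mid c)=\dfrac{N_{cj}+\lambda}{N_c+2\lambda}$,其中 $N_{cj}$ 为类别 $c$ 中特征 $j$ 取值为 1 的训练样本数,$N_c$ 为类别 $c$ 的训练样本数,$\lambda$ 为平滑参数。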
拉普拉斯平滑
也就是代码中 NaiveBayes.train(training,lambda = 1.0) 的 lambda 参数,即平滑系数 λ;取 λ = 1 时就是拉普拉斯平滑。
本文转自张昺华-sky博客园博客,原文链接:http://www.cnblogs.com/bonelee/p/7841678.html,如需转载请自行联系原作者