Analyze the movie rating data using SQL: find the Top 10 movies by average rating, counting only movies that have been rated more than 200 times.
Data preview
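The input file data/text/rating_100k.data appears to be the MovieLens 100k ratings file. Judging from the parsing code below, each line is one rating record with four tab-separated fields: user id, movie id, rating (1 to 5), and timestamp. An illustrative line (values shown only as an example of the layout) would look like:

196	242	3	881250949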
Code
package org.example.SQL

import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

// Movie rating data analysis
object sql_Movie {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org").setLevel(Level.ERROR)

    val spark: SparkSession = SparkSession.builder()
      .appName("sparksql")
      .master("local[*]")
      .config("spark.sql.shuffle.partitions", 4)
      .getOrCreate()
    import spark.implicits._

    // Read the raw ratings file; each line is one rating record
    val ds: Dataset[String] = spark.read.textFile("data/text/rating_100k.data")
    // ds.printSchema()
    // ds.show()

    // Split each tab-separated line and keep only the movie id and the rating
    val movies: DataFrame = ds.map(line => {
      val arr: Array[String] = line.split("\t")
      (arr(1), arr(2).toInt)
    }).toDF("movieid", "score")
    movies.printSchema()
    movies.show()

    // Top 10 average ratings among movies rated more than 200 times
    movies.createOrReplaceTempView("movie")
    val sql =
      """
        |select movieid, count(*) as counts, avg(score) as avgs
        |from movie
        |group by movieid
        |having counts > 200
        |order by avgs
        |limit 10
        |""".stripMargin
    spark.sql(sql).show()
  }
}
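One thing to watch: order by avgs sorts in ascending order, so the query above actually returns the 10 movies with the lowest average rating among those rated more than 200 times, which is what the result output below shows. To get the Top 10 highest-rated movies the task asks for, sort descending, for example (sqlTop10 is just a local name for the adjusted query):

    val sqlTop10 =
      """
        |select movieid, count(*) as counts, avg(score) as avgs
        |from movie
        |group by movieid
        |having counts > 200
        |order by avgs desc
        |limit 10
        |""".stripMargin
    spark.sql(sqlTop10).show()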
Printed data
+-------+-----+
|movieid|score|
+-------+-----+
|    242|    3|
|    302|    3|
|    377|    1|
|     51|    2|
|    346|    1|
|    474|    4|
|    265|    2|
|    465|    5|
|    451|    3|
|     86|    3|
|    257|    2|
|   1014|    5|
|    222|    5|
|     40|    3|
|     29|    3|
|    785|    3|
|    387|    5|
|    274|    2|
|   1042|    4|
|   1184|    2|
+-------+-----+
Result output
+-------+------+------------------+
|movieid|counts|              avgs|
+-------+------+------------------+
|    678|   219| 2.808219178082192|
|    235|   217| 2.847926267281106|
|    323|   240| 2.933333333333333|
|    289|   259|2.9806949806949805|
|    546|   254| 3.031496062992126|
|    322|   218|3.0871559633027523|
|    245|   240|3.1083333333333334|
|    748|   316|3.1234177215189876|
|    597|   206| 3.150485436893204|
|    294|   485| 3.156701030927835|
+-------+------+------------------+
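For comparison, the same aggregation can also be written with the DataFrame API instead of a temp view and SQL. This is only a sketch, reusing the movies DataFrame built above and again sorting descending to get the highest averages:

    import org.apache.spark.sql.functions.{avg, col, count, desc}

    // Group by movie, count the ratings, average the scores,
    // keep only movies rated more than 200 times, take the best 10
    val top10: DataFrame = movies
      .groupBy("movieid")
      .agg(count("*").as("counts"), avg("score").as("avgs"))
      .where(col("counts") > 200)
      .orderBy(desc("avgs"))
      .limit(10)
    top10.show()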