user_artist_data数据记录用户播放某位艺术家歌曲的次数,数据包含3个字段,分别为userid(用户ID)、artistid(艺术家ID)、playcount(播放次数)。
拿到数据用excel拆分,数据分列
笑死,数据分列后第三列(playcount)居然不是数值型
数据改好了,可以这样试试
package thisterm; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; public class homework_4 { public static void main(String[] args) { // TODO Auto-generated method stub SparkSession spark = SparkSession.builder().master("local").appName("Java Spark SQL basic example").config("spark.testing.memory","2147480000").getOrCreate(); Dataset<Row> df = spark.read().format("csv").option("header", "true").load("file:///home/gyq/eclipse-workspace/user_artist_data.csv"); //df.show(); df.createOrReplaceTempView("UAD"); //1)统计非重复的用户个数。 //spark.sql("select count(distinct userid) as usernumber from UAD ").show(); //统计用户听过的歌曲总数。 //spark.sql("select userid,count(playcount) as playcount from UAD group by userid").show(); // 找出ID为“1000002”的用户最喜欢的10首歌曲(即播放次数最多的10首歌曲)。 spark.sql("select * from UAD where userid='1000002' order by playcount desc limit 20").show(); spark.stop(); } }
或者不改数据,直接用构造类
方法二
使用SparkSQL对该数据进行探索分析。
(1)统计非重复的用户个数。
(2)统计用户听过的歌曲总数。
(3)找出ID为“1000002”的用户最喜欢的10首歌曲(即播放次数最多的10首歌曲)。
package thisterm; import java.io.Serializable; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.sql.Dataset; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; public class addtableheader { public static class Person implements Serializable { private String userid; private String artistid; private int playcount; public String getUserid() {return userid; } public void setUserid(String userid) { this.userid = userid; } public String getArtistid() {return artistid;} public void setArtistid(String artistid) {this.artistid = artistid;} public int getPlaycount() {return playcount;} public void setPlaycount(int playcount) {this.playcount = playcount;} } public static void main(String[] args) { // TODO Auto-generated method stub SparkSession spark = SparkSession.builder().master("local").appName("Java Spark SQL basic example").config("spark.testing.memory","2147480000").getOrCreate(); JavaRDD<String> stringRDD = spark.read() .textFile("/home/gyq/eclipse-workspace/user_artist_data.csv") .javaRDD(); JavaRDD<Person> peopleRDD = stringRDD.map(line -> { String[] parts = line.split(" "); Person person = new Person(); person.setUserid(parts[0]); person.setArtistid(parts[1]); person.setPlaycount(Integer.parseInt(parts[2])); return person; }); Dataset<Row> peopleDF = spark.createDataFrame(peopleRDD, Person.class); //peopleDF.show(); peopleDF.createOrReplaceTempView("UAD"); //1)统计非重复的用户个数。 spark.sql("select count(distinct userid) as usernumber from UAD ").show(); //统计用户听过的歌曲总数。 //spark.sql("select userid,count(playcount) as playcount from UAD group by userid").show(); // 找出ID为“1000002”的用户最喜欢的10首歌曲(即播放次数最多的10首歌曲)。 //spark.sql("select * from UAD where userid='1000002' order by playcount desc limit 10").show(); } }