任务
一个在线商品购买记录数据集,约40M,每行依次为"用户名,商品名,价格",以逗号分隔,格式如下:
Jack,iphone cover,9.99
Jack,iphone cover,9.99
Jack,iphone cover,9.99
Jack,iphone cover,9.99
完成统计:
1.购买总次数
2.客户总个数
3.总收入
4.最畅销的商品
代码
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import org.apache.commons.collections.comparators.ComparableComparator;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.DoubleFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;
/**
 * Computes summary statistics over a purchase-history CSV with Spark's Java API.
 *
 * <p>Each input line is split on commas; field 0 is read as the user, field 1 as the
 * product name and field 2 as the price. The program prints the total number of
 * purchases, the number of distinct users, the total revenue and the product with
 * the highest purchase count.
 *
 * @author jinhang
 */
public class JavaApp {

    /**
     * Runs the analysis against {@code data/UserPurchaseHistory.csv} on a local Spark master
     * and prints the results to standard output.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        SparkConf sparkConf = new SparkConf().setAppName("ShopInfoAnalysis").setMaster("local[*]");
        JavaSparkContext sc = new JavaSparkContext(sparkConf);
        try {
            // The parsed rows feed several independent actions below, so cache them
            // instead of re-reading and re-splitting the file for every action.
            JavaRDD<String[]> data = sc.textFile("data/UserPurchaseHistory.csv")
                    .map(s -> s.split(","))
                    .cache();

            long numPurchases = data.count();
            long uniqueUsers = data.map(s -> s[0]).distinct().count();
            double totalRevenue = data.mapToDouble(s -> Double.parseDouble(s[2])).sum();

            // (product, 1) pairs summed per product give the purchase count of each product.
            JavaPairRDD<String, Integer> product = data.mapToPair(s -> new Tuple2<>(s[1], 1));
            // sortByKey() orders by product NAME; it is kept only so the printed listing is
            // stable. It must not be used to pick the best seller.
            List<Tuple2<String, Integer>> pairs =
                    product.reduceByKey((x, y) -> x + y).sortByKey().collect();
            System.out.println(pairs);

            System.out.println("Total purchases: " + numPurchases);
            System.out.println("Unique users: " + uniqueUsers);
            System.out.println("Total revenue: " + totalRevenue);

            if (pairs.isEmpty()) {
                // No rows at all: there is no best seller to report.
                System.out.println("Most popular product: none (no purchases found)");
            } else {
                // Pick the pair with the largest count on the driver, so that the product
                // name and its purchase count always come from the same element.
                Comparator<Tuple2<String, Integer>> byCount = Comparator.comparingInt(t -> t._2());
                Tuple2<String, Integer> best = Collections.max(pairs, byCount);
                String mostPopular = best._1();
                int purchases = best._2();
                System.out.println(String.format("Most popular product: %s with %d purchases",
                        mostPopular, purchases));
            }
        } finally {
            // Always release the Spark context, even when an action fails.
            sc.stop();
        }
    }
}
通过简单的 RDD 转换(transformation)和行动(action)操作,就可以轻松完成大数据的统计任务;用 Java 实现的代码也便于与已有的 Hadoop 代码结合执行。