样本数据如下(未截取完整,另一篇有生成数据的代码)
代码如下:
import java.util.Arrays; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaPairRDD; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import scala.Tuple2; public class test9 { public static void main(String[] args) { // TODO Auto-generated method stub SparkConf sparkConf = new SparkConf().setAppName("PeopleInfoCalculator").setMaster("local"); JavaSparkContext sc = new JavaSparkContext(sparkConf); JavaRDD<String> filedata=sc.textFile("file:///home/gyq/eclipse-workspace/ALS/leInfo.txt"); JavaRDD<String> Datardd=filedata.filter(f->{//过滤缺失的数据 String[] tok=f.split(" "); if(tok.length<3) return false; else return true; }); JavaRDD<String> manrdd=Datardd.filter(f->f.contains("M"));//过滤女性 JavaRDD<String> womanrdd=Datardd.filter(f->f.contains("F"));//过滤男性 JavaRDD<String> manrdd1=manrdd.flatMap(f->Arrays.asList(f.split(" ")[2]).iterator());//取男性字符 JavaRDD<String> womanrdd1=womanrdd.flatMap(f->Arrays.asList(f.split(" ")[2]).iterator());//取女性字符 JavaRDD<Integer> manrdd2 = manrdd1.map(f-> Integer.valueOf(f));//男性数值 JavaRDD<Integer> womanrdd2 = womanrdd1.map(f->Integer.parseInt(f)); //女性数值 JavaRDD<Integer> manhigh= manrdd2.sortBy(f->f,false,3);//男性排序高到底 JavaRDD<Integer> manlow = manrdd2.sortBy(f->f,true,3);//男性排序底到高 int manhighestconsume=manhigh.first();//男人最高消费 int manlowestconsume=manlow.first();//男人最低消费 JavaRDD<Integer> womanhigh = womanrdd2.sortBy(f->f,false,3);//女性排序高到底 JavaRDD<Integer> womanlow = womanrdd2.sortBy(f->f,true,3);//女性排序底到高 int womanhighestconsume= womanhigh.first();//女人最高消费 int womanlowestconsume=womanlow.first();//女人最低消费 System.out.println("男人最高消费"+manhighestconsume); System.out.println("男人最低消费"+manlowestconsume); System.out.println("女人最高消费"+womanhighestconsume); System.out.println("女人最低消费"+womanlowestconsume); sc.close(); } }
结果如下: