Preface
Learning any programming language starts with "hello world"; for a big data framework, the starting point is wordcount, and Spark is no exception. Once you have studied Spark systematically, you will find that wordcount can be implemented in eleven different ways. How many of them do you know? If some are new to you, read on!
11 ways to implement wordcount
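Every variant below follows the same pattern: it receives a SparkContext, builds an RDD from two sample lines, splits them into words, and counts. To actually run them you need a small driver; here is a minimal sketch assuming local mode (the object name, app name, and master URL are illustrative, and the scala.collection.mutable import is needed by methods 9 through 11):

import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable

object WordCountDemo {
  def main(args: Array[String]): Unit = {
    // Local mode on all cores; these settings are placeholders for illustration
    val conf = new SparkConf().setAppName("WordCountDemo").setMaster("local[*]")
    val sc = new SparkContext(conf)
    wordcount(sc) // swap in any of the eleven variants below
    sc.stop()
  }

  // ... the eleven wordcount methods go here ...
}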
Method 1: groupBy
groupBy groups elements by an arbitrary key function; a word's count is then simply the size of its group.

def wordcount(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // Group the (word, 1) tuples by the word itself
  val group = mRDD.groupBy(t => t._1)
  // A word's count is the size of its group
  val wordcount = group.mapValues(iter => iter.size)
}
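Note that groupBy, like the other RDD transformations used in methods 1 through 6, is lazy: nothing is computed until an action runs. To inspect the result of any of those variants, collect and print it; with the sample data, every method should yield the same three pairs (an illustrative addition, not part of the original snippets):

// Trigger the job and bring the result to the driver
wordcount.collect().foreach(println)
// Expected output (order may vary):
// (Hello,2)
// (Scala,1)
// (Spark,1)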
Method 2: groupByKey
groupByKey groups a (K, V) RDD by its key directly, so all the 1s for a word are shuffled into one group before being counted.

def wordcount2(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // Group by key; each value is the iterable of 1s for that word
  val group = mRDD.groupByKey()
  val wordcount = group.mapValues(iter => iter.size)
}
Method 3: reduceByKey
reduceByKey combines the values for each key with a binary function and pre-aggregates on the map side, so it shuffles far less data than groupByKey.

def wordcount3(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // Sum the 1s per word; partial sums are combined before the shuffle
  val wordcount = mRDD.reduceByKey(_ + _)
}
Method 4: aggregateByKey
aggregateByKey takes a zero value plus two functions: one merges values within a partition, the other merges partial results across partitions. For a plain sum, both are addition.

def wordcount4(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // Zero value 0; intra-partition and inter-partition functions are both addition
  val wordcount = mRDD.aggregateByKey(0)(_ + _, _ + _)
}
Method 5: foldByKey
foldByKey is the special case of aggregateByKey where the intra-partition and inter-partition functions are identical, so only one function is needed.

def wordcount5(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.foldByKey(0)(_ + _)
}
Method 6: combineByKey
combineByKey is the most general of the byKey aggregations: it takes a function to build the initial combiner from the first value, one to merge a value into a combiner, and one to merge two combiners.

def wordcount6(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  val wordcount = mRDD.combineByKey(
    v => v,                    // createCombiner: first value becomes the combiner
    (x: Int, y) => x + y,      // mergeValue: fold a value in, within a partition
    (x: Int, y: Int) => x + y  // mergeCombiners: merge partial results across partitions
  )
}
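The type annotations inside the lambdas exist because combineByKey cannot infer the combiner type from `v => v` alone. If the three arguments read cryptically, they can be written as named functions instead; a sketch with illustrative names:

// createCombiner: the first value seen for a key becomes the initial combiner
def createCombiner(v: Int): Int = v
// mergeValue: fold one more value into the combiner, within a partition
def mergeValue(c: Int, v: Int): Int = c + v
// mergeCombiners: merge two partial combiners, across partitions
def mergeCombiners(c1: Int, c2: Int): Int = c1 + c2

val wordcount = mRDD.combineByKey(createCombiner, mergeValue, mergeCombiners)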
Method 7: countByKey
countByKey counts how many times each key appears; since a word occurs once per (word, 1) pair, this is exactly the word count.

def wordcount7(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  val mRDD = mapRDD.map((_, 1))
  // Action: returns scala.collection.Map[String, Long] to the driver
  val wordcount = mRDD.countByKey()
}
Method 8: countByValue
countByValue counts occurrences of each distinct element, so the map to (word, 1) pairs is not even needed.

def wordcount8(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val mapRDD = rdd.flatMap(_.split(" "))
  // Action: counts each distinct word directly
  val wordcount = mapRDD.countByValue()
}
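Unlike the first six methods, which return RDDs and stay lazy, countByKey and countByValue are actions: they run a job immediately and return an ordinary Scala Map to the driver, so they only suit result sets small enough to fit in driver memory. An illustrative check:

// Both actions return scala.collection.Map[String, Long] on the driver
val counts = mapRDD.countByValue()
println(counts) // e.g. Map(Hello -> 2, Scala -> 1, Spark -> 1)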
Method 9: reduce
Here each word is wrapped in a single-entry Map, and reduce merges those Maps pairwise by summing the counts for the same word.

def wordcount9(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is easier to update in place
  val mapRDD = words.map(word => mutable.Map[String, Long]((word, 1)))
  // Merge Map into Map: add map2's counts into map1
  val wordcount = mapRDD.reduce(
    (map1, map2) => {
      map2.foreach { case (word, count) =>
        val newCount = map1.getOrElse(word, 0L) + count
        map1.update(word, newCount)
      }
      map1
    }
  )
  println(wordcount)
}
Method 10: aggregate
aggregate performs the same Map merging but requires a zero value plus separate intra-partition and inter-partition functions, which are identical here and can be shared.

def wordcount10(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is easier to update in place
  val mapRDD = words.map(word => mutable.Map[String, Long]((word, 1)))
  // The same merge function serves both within and across partitions
  val mergeMaps = (map1: mutable.Map[String, Long], map2: mutable.Map[String, Long]) => {
    map2.foreach { case (word, count) =>
      map1.update(word, map1.getOrElse(word, 0L) + count)
    }
    map1
  }
  // Zero value: the first word with a count of 0, i.e. a neutral entry
  val wordcount = mapRDD.aggregate(mutable.Map[String, Long]((mapRDD.first().keySet.head, 0)))(mergeMaps, mergeMaps)
  println(wordcount)
}
Method 11: fold
fold is aggregate with a single function for both the intra-partition and inter-partition merge, so the duplicated lambda from method 10 collapses into one.

def wordcount11(sc: SparkContext): Unit = {
  val rdd = sc.makeRDD(List("Hello Scala", "Hello Spark"))
  val words = rdd.flatMap(_.split(" "))
  // word => Map(word -> 1); a mutable Map is easier to update in place
  val mapRDD = words.map(word => mutable.Map[String, Long]((word, 1)))
  // Zero value: the first word with a count of 0, i.e. a neutral entry
  val wordcount = mapRDD.fold(mutable.Map[String, Long]((mapRDD.first().keySet.head, 0)))(
    (map1, map2) => {
      map2.foreach { case (word, count) =>
        map1.update(word, map1.getOrElse(word, 0L) + count)
      }
      map1
    }
  )
  println(wordcount)
}
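Both method 10 and method 11 build their zero value with `mapRDD.first().keySet.head`, which runs an extra Spark job just to fetch one word. Since the merge function already handles missing keys via getOrElse, an empty map is an equally valid and cheaper neutral element; a minimal sketch of method 11 with that one change:

// Empty map as the zero value: no extra job, and merging is unaffected
val wordcount = mapRDD.fold(mutable.Map[String, Long]())(
  (map1, map2) => {
    map2.foreach { case (word, count) =>
      map1.update(word, map1.getOrElse(word, 0L) + count)
    }
    map1
  }
)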
Conclusion
Those are the eleven ways to implement WordCount! That's all for today. If this article helped you, likes, bookmarks, and shares are welcome; they matter a great deal for my continued writing of quality articles. Thanks 🙏🏻