主程序代码如下:
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
//todo:利用scala语言开发spark的wordcount程序(本地运行)
/**
 * Spark word-count job that runs in local mode.
 *
 * Reads a text file, splits each line into words, counts the occurrences of
 * every word and prints the (word, count) pairs ordered by descending count.
 */
object WordCount {

  /** Input file used when no path is supplied on the command line. */
  private val DefaultInputPath = "E:\\words.txt"

  /**
   * Program entry point.
   *
   * @param args optional command-line arguments; when present, `args(0)` is the
   *             path of the text file to process, otherwise [[DefaultInputPath]] is used
   */
  def main(args: Array[String]): Unit = {
    // 1. Build the SparkConf: application name plus master URL.
    //    local[2] means "run locally with 2 threads".
    val sparkConf: SparkConf = new SparkConf().setAppName("WordCount").setMaster("local[2]")
    // 2. Create the SparkContext, the entry point of every Spark program;
    //    it builds the DAGScheduler and TaskScheduler internally.
    val sc = new SparkContext(sparkConf)
    try {
      // Reduce log noise.
      sc.setLogLevel("warn")
      // 3. Read the input file, one RDD element per line.
      val inputPath: String = args.headOption.getOrElse(DefaultInputPath)
      val data: RDD[String] = sc.textFile(inputPath)
      // 4. Split every line into words. Splitting on whitespace runs and dropping
      //    blanks keeps repeated or leading spaces from being counted as a "" word.
      val words: RDD[String] = data.flatMap(_.split("\\s+")).filter(_.nonEmpty)
      // 5. Pair every word with an initial count of 1.
      val wordAndOne: RDD[(String, Int)] = words.map(word => (word, 1))
      // 6. Sum the 1s of identical words.
      val result: RDD[(String, Int)] = wordAndOne.reduceByKey(_ + _)
      // Sort by count; sortBy is ascending by default, so request descending explicitly.
      val sortedRDD: RDD[(String, Int)] = result.sortBy(_._2, ascending = false)
      // 7. Bring the result to the driver and print it.
      val finalResult: Array[(String, Int)] = sortedRDD.collect()
      finalResult.foreach(println)
    } finally {
      // 8. Always release the SparkContext, even when a step above fails.
      sc.stop()
    }
  }
}
pom.xml文件:
<properties>
<!-- Version properties for this build. The Scala and Spark entries are commented out; the dependencies in this file hard-code their versions instead. -->
<!--<scala.version>2.11.8</scala.version>-->
<!-- NOTE(review): hadoop.version is not referenced anywhere in the visible part of this POM - confirm it is still needed. -->
<hadoop.version>3.1.0</hadoop.version>
<!--<spark.version>2.1.3</spark.version>-->
</properties>
<dependencies>
<!-- Scala standard library, pinned to 2.12.7. -->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.12.7</version>
</dependency>
<!-- Spark core 2.4.0 built for Scala 2.12 (the _2.12 artifact suffix), which matches the 2.12.x scala-library declared above. -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.12</artifactId>
<version>2.4.0</version>
</dependency>
</dependencies>
<build>
  <!-- Scala sources live under src/main/scala and src/test/scala. -->
  <sourceDirectory>src/main/scala</sourceDirectory>
  <testSourceDirectory>src/test/scala</testSourceDirectory>
  <plugins>
    <!-- Compiles the Scala main and test sources. -->
    <plugin>
      <groupId>net.alchim31.maven</groupId>
      <artifactId>scala-maven-plugin</artifactId>
      <version>3.2.2</version>
      <executions>
        <execution>
          <goals>
            <goal>compile</goal>
            <goal>testCompile</goal>
          </goals>
          <configuration>
            <args>
              <arg>-dependencyfile</arg>
              <arg>${project.build.directory}/.scala_dependencies</arg>
            </args>
          </configuration>
        </execution>
      </executions>
    </plugin>
    <!-- Builds a single shaded jar during the package phase. -->
    <plugin>
      <groupId>org.apache.maven.plugins</groupId>
      <artifactId>maven-shade-plugin</artifactId>
      <version>2.4.3</version>
      <executions>
        <execution>
          <phase>package</phase>
          <goals>
            <goal>shade</goal>
          </goals>
          <configuration>
            <filters>
              <filter>
                <artifact>*:*</artifact>
                <!-- Drop signature files of signed dependencies; they no longer match the repackaged jar contents. -->
                <excludes>
                  <exclude>META-INF/*.SF</exclude>
                  <exclude>META-INF/*.DSA</exclude>
                  <exclude>META-INF/*.RSA</exclude>
                </excludes>
              </filter>
            </filters>
            <transformers>
              <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                <!-- Entry point written to the jar manifest; WordCount is the object that defines main. -->
                <mainClass>WordCount</mainClass>
              </transformer>
            </transformers>
          </configuration>
        </execution>
      </executions>
    </plugin>
  </plugins>
</build>
</project>
程序启动后报错内容如下: