1. Create a directory in HDFS
hadoop fs -mkdir /input
2. Upload the file /opt/hadoop-2.9.2/LICENSE.txt (run the command from that directory)
hadoop fs -put LICENSE.txt /input
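To confirm the upload worked, the target directory can be listed with the standard HDFS shell command:
hadoop fs -ls /input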
3. Run the example program under /opt/hadoop-2.9.2/share/hadoop/mapreduce
Note: the /output directory must not exist beforehand. Hadoop creates it itself as part of the job; this is a built-in Hadoop mechanism, and if the directory already exists the job refuses to run.
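If a previous run has already left an /output directory behind, it can be removed first (only do this if its contents are no longer needed):
hadoop fs -rm -r /output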
hadoop jar hadoop-mapreduce-examples-2.9.2.jar wordcount /input /output
4. Check the job's running status
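The progress of the running job can also be checked from the command line, or in the YARN ResourceManager web UI, which listens on port 8088 by default (replace the placeholder host with your own ResourceManager host):
yarn application -list
http://<resourcemanager-host>:8088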
5. View the result: hadoop fs -cat /output/part-r-00000
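If more than one reducer is used there will be several part-r-* files, so it can help to list the output directory first; the empty _SUCCESS marker file indicates the job completed normally:
hadoop fs -ls /output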
6. Source code analysis
package com.xq.wordcount;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WordCount {

    // Custom mapper, extends org.apache.hadoop.mapreduce.Mapper
    public static class MyMapper extends org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, Text, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            // split() breaks a string on a given delimiter (string or regex) and returns a String[].
            // The delimiter must match the input file: splitting on "\t" (a tab) while the file is
            // space-separated, as LICENSE.txt is, skews the counts badly, so a space is used here.
            String[] splited = line.split(" ");
            // Enhanced for loop: for (ElementType x : collection) { ... }
            for (String word : splited) {
                context.write(new Text(word), new LongWritable(1));
            }
        }
    }

    public static class MyReducer extends org.apache.hadoop.mapreduce.Reducer<Text, LongWritable, Text, LongWritable> {
        @Override
        protected void reduce(Text k2, Iterable<LongWritable> v2s,
                Reducer<Text, LongWritable, Text, LongWritable>.Context context)
                throws IOException, InterruptedException {
            long count = 0L;
            for (LongWritable v2 : v2s) {
                count += v2.get();
            }
            LongWritable v3 = new LongWritable(count);
            context.write(k2, v3);
        }
    }

    // Driver (client) code; once written, it is handed to the ResourceManager framework for execution
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, WordCount.class.getSimpleName());
        // Packaged as a jar and executed
        job.setJarByClass(WordCount.class);
        // Where is the input data?
        FileInputFormat.setInputPaths(job, args[0]);
        // Which mapper processes the input?
        job.setMapperClass(MyMapper.class);
        // What are the map output key/value types?
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setCombinerClass(MyReducer.class); // combine
        // Which reducer processes the map output?
        job.setReducerClass(MyReducer.class);
        // What are the reduce output key/value types?
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Where does the output go?
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit to YARN and wait until the job finishes before this program exits
        job.waitForCompletion(true);
    }
}
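Because MyReducer is also registered as the combiner, partial sums are computed on the map side before the shuffle; this is only safe because summation is associative and commutative. As an illustrative sketch (not actual output from this job), a single map task's records
(the, 1), (the, 1), (of, 1)
are combined locally into
(the, 2), (of, 1)
and the reducer then adds up these partial counts across all map tasks.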
pom.xml (relevant sections)
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
</properties>

<build>
    <pluginManagement>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.xq.wordcount.WordCount</mainClass>
                            <addClasspath>true</addClasspath>
                            <classpathPrefix>lib/</classpathPrefix>
                        </manifest>
                    </archive>
                    <classesDirectory>
                    </classesDirectory>
                </configuration>
            </plugin>
        </plugins>
    </pluginManagement>
</build>

<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>3.8.1</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>2.6.0</version>
    </dependency>
    <dependency>
        <groupId>log4j</groupId>
        <artifactId>log4j</artifactId>
        <version>1.2.17</version>
    </dependency>
</dependencies>
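With this pom the job can be built with Maven and submitted to the cluster. The jar name below (target/wordcount-1.0.jar) is only a placeholder and depends on your own artifactId and version; the main class does not need to be given on the command line because <mainClass> is set in the manifest, and the output path must again be a directory that does not yet exist:
mvn clean package
hadoop jar target/wordcount-1.0.jar /input /output2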