Hadoop MapReduce: WordCount Example

  • Based on the IntelliJ IDEA development tool; the basic configuration is as follows:
    1. Click File -> New -> Project. In the dialog that pops up, select Maven, choose the JDK version you have installed, and click Next.



2. Fill in the Maven GroupId and ArtifactId.

3. Open IntelliJ's Preferences, navigate to Build, Execution, Deployment -> Compiler -> Java Compiler, and change the Target bytecode version for WordCount to your JDK version (mine is 1.8); a pom.xml alternative is sketched below.


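If you'd rather not rely on the IDE setting (it can be lost when the project is re-imported), a common alternative is to pin the compiler level in pom.xml via the maven-compiler-plugin. A minimal sketch, assuming JDK 1.8 (merge this <plugin> into the <build><plugins> section shown in step 4):

    <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <configuration>
            <source>1.8</source>
            <target>1.8</target>
        </configuration>
    </plugin>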

4. Configure the dependencies.
Find the pom.xml configuration file and set it up as follows:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>hadoop</groupId>
    <artifactId>com.hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>apache</id>
            <url>https://repo.maven.apache.org/maven2</url>
        </repository>
    </repositories>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.7.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.logging.log4j</groupId>
            <artifactId>log4j-core</artifactId>
            <version>2.8.2</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>RELEASE</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <plugin>
                <artifactId>maven-dependency-plugin</artifactId>
                <configuration>
                    <excludeTransitive>false</excludeTransitive>
                    <stripVersion>true</stripVersion>
                    <outputDirectory>./lib</outputDirectory>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
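The maven-dependency-plugin configured above copies the project's dependencies into ./lib with their version numbers stripped. Assuming a standard Maven installation, you can trigger it from the project root with:

mvn dependency:copy-dependencies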

5. In the src/main/resources directory, create a file named log4j.properties:

log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
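Note that the logfile appender is defined here but never attached to the root logger, so target/spring.log will not actually be written. If you also want file output, the first line would need to list both appenders:

log4j.rootLogger=INFO, stdout, logfile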

6. Write the program code.
6.1> Project layout: the three classes below (WordCountMapper, WordCountReducer, WordCountDriver) live in the come.hadoop.mr.wordcount package under src/main/java, with the log4j.properties file under src/main/resources.

6.2> The WordCountMapper code is as follows:

package come.hadoop.mr.wordcount;

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;


// Map phase
// KEYIN:    type of the input key (the line's byte offset in the file, LongWritable)
// VALUEIN:  type of the input value (one line of text, Text)
// KEYOUT:   type of the output key (a word, Text)
// VALUEOUT: type of the output value (the count for one occurrence, IntWritable)
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reused across map() calls to avoid allocating new objects per record
    Text k = new Text();
    IntWritable v = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // 1. Get one line (the framework calls map() once per line),
        //    e.g. "奥巴马 习近平"
        String line = value.toString();

        // 2. Split the line into words on spaces
        String[] words = line.split(" ");

        // 3. Emit each word
        for (String word : words) {
            // key: the word; value: one occurrence
            k.set(word);
            context.write(k, v);
        }
    }
}
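To make the data flow concrete: for the sample input line "wudy peter", this mapper emits the pairs (wudy, 1) and (peter, 1). The framework then sorts and groups all emitted pairs by key before handing them to the reducer.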

6.3> The WordCountReducer code is as follows:

package come.hadoop.mr.wordcount;


// Note: import the new-API org.apache.hadoop.mapreduce.Reducer,
// not the old org.apache.hadoop.mapred.Reducer
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Reused output value object
    IntWritable v = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // reduce() is called once per key with all of that key's values,
        // e.g. key = "peter", values = [1, 1, 1]
        // 1. Sum the counts
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        v.set(sum);
        // 2. Write the result, e.g. peter 3
        context.write(key, v);
    }
}
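With the sample input from step 7, the shuffle delivers all of a word's counts to a single reduce() call: the reducer receives (peter, [1, 1, 1]), sums the values to 3, and writes the line "peter 3"; every other word is handled the same way in its own call.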

6.4> The WordCountDriver code is as follows:

package come.hadoop.mr.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;

public class WordCountDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // 1. Get a Job instance
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // 2. Tell Hadoop which jar to ship to the cluster (located via this class)
        job.setJarByClass(WordCountDriver.class);

        // 3. Wire up the Mapper and Reducer classes
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // 4. Set the key/value types of the map output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 5. Set the key/value types of the final output (what WordCountReducer writes)
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 6. Set the input and output paths
        FileInputFormat.setInputPaths(job, new Path(args[0]));  // first argument: input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // second argument: output path

        // 7. Submit the job
        // job.submit() would return immediately; waitForCompletion(true) blocks
        // until the job finishes and prints progress along the way
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}

7. Add the input file content (/Users/XXX/Desktop/word/wordcount.txt):

wudy peter
sunny timo peter
张三 wudy
奥巴马 习近平
peter jack

8. In the Application run configuration, pass the input file path and the output directory path as the two program arguments handed to args; an example is sketched below.
Note: the output folder (output) must not exist before the run.

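A minimal sketch of the Program arguments field, assuming the input file from step 7 and a hypothetical output directory next to it (the job creates the output directory itself):

/Users/XXX/Desktop/word/wordcount.txt /Users/XXX/Desktop/word/output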

9. Run WordCountDriver; the output is:

jack    1
peter   3
sunny   1
timo    1
wudy    2
习近平 1
奥巴马 1
张三  1

10. Package the project into a jar file.
Open View -> Tool Windows -> Maven Projects and, in the Maven panel that appears, run the package lifecycle phase.


11. After packaging completes, the built jars appear under the project's target directory. Pick the jar without bundled dependencies, rename it to wc.jar, and upload it to the server cluster.



12. Run our test jar on the cluster.
Run the command: hadoop jar wc.jar come.hadoop.mr.wordcount.WordCountDriver /user/wudy/input /user/wudy/output


Notes:

  1. come.hadoop.mr.wordcount.WordCountDriver is the fully qualified name of our main class.
  2. /user/wudy/input is our HDFS input path; if the directory does not exist, create it with: bin/hdfs dfs -mkdir -p /user/wudy/input
  3. /user/wudy/output is our HDFS output path (if this path already exists, delete it first); see the command sketch after this list.
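Assuming wordcount.txt from step 7 has been copied to the current directory on the machine you submit from, a typical staging sequence looks like this:

bin/hdfs dfs -mkdir -p /user/wudy/input
bin/hdfs dfs -put wordcount.txt /user/wudy/input
bin/hdfs dfs -rm -r /user/wudy/output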

13. Once the run completes, you can see the generated files in the /user/wudy/output directory.



Click Download and you can see our computed results.
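Alternatively, you can print the result straight from the command line; with a single reducer the output lands in one part-r-00000 file:

bin/hdfs dfs -cat /user/wudy/output/part-r-00000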
