A previous article, writing MapReduce programs in Eclipse, actually ran the jobs locally.
I went looking for tutorials on how to submit a job to the cluster from inside a Java web project, but following them I could not get anything to work. After fighting through a series of errors I finally succeeded, so this post records the steps.
1. Create a plain Java web project named WordCountPage
This article uses WordCount as the example; it is only meant as a starting point.
2. Code
TWC3.java
//TWC3.java
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class TWC3 {

    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String valueString = value.toString();
            StringTokenizer itr = new StringTokenizer(valueString);
            while (itr.hasMoreTokens()) {
                word.set(itr.nextToken());
                context.write(word, one);
            }
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private IntWritable result = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            int sum = 0;
            for (IntWritable val : values) {
                sum += val.get();
            }
            result.set(sum);
            context.write(key, result);
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(TWC3.class);
        // args[2] is the path of the jar that was uploaded to the server beforehand
        job.setJar(args[2]);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // delete the output directory if it already exists, otherwise the job would fail
        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, output);
        // System.exit(job.waitForCompletion(true) ? 0 : 1);
        job.waitForCompletion(true);
    }
}
WCServlet.java
//WCServlet.java
import java.io.IOException;
import java.util.Calendar;
import javax.servlet.ServletException;
import javax.servlet.annotation.WebServlet;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
/**
* Servlet implementation class WCServlet
*/
@WebServlet("/WCServlet")
public class WCServlet extends HttpServlet {
    private static final long serialVersionUID = 1L;

    /**
     * @see HttpServlet#HttpServlet()
     */
    public WCServlet() {
        super();
    }

    /**
     * @see HttpServlet#doGet(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doGet(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        response.getWriter().append("Served at: ").append(request.getContextPath());

        String[] args = new String[3];
        Calendar now = Calendar.getInstance();
        // HDFS directory that already holds the input data
        args[0] = "hdfs://master:9000/input/";
        // a fresh output directory per request, named after the current timestamp
        args[1] = "hdfs://master:9000/output/" + now.getTimeInMillis();
        // path of the MR jar uploaded to Tomcat's lib/userlib beforehand
        String a = System.getProperty("catalina.home") + "/lib/userlib/TWC2-3.jar";
        // System.out.println("========a=======" + a);
        response.getWriter().append("Served at: " + a);
        args[2] = a;
        try {
            TWC3.main(args);
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
    }

    /**
     * @see HttpServlet#doPost(HttpServletRequest request, HttpServletResponse response)
     */
    protected void doPost(HttpServletRequest request, HttpServletResponse response) throws ServletException, IOException {
        doGet(request, response);
    }
}
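The servlet above only submits the job; it does not show the word counts in the browser. If you want to display the result after TWC3.main(args) returns, you can read the part-r-00000 file from the output directory. A minimal sketch of such a helper (readResult is an illustrative addition, not part of the original project), to be called as readResult(args[1], response):
// Hypothetical helper: stream the reducer output back to the HTTP response.
// outputDir is the directory the job wrote to, e.g. hdfs://master:9000/output/<timestamp>
private void readResult(String outputDir, HttpServletResponse response) throws IOException {
    org.apache.hadoop.conf.Configuration conf = new org.apache.hadoop.conf.Configuration();
    org.apache.hadoop.fs.FileSystem fs =
            org.apache.hadoop.fs.FileSystem.get(java.net.URI.create(outputDir), conf);
    org.apache.hadoop.fs.Path part = new org.apache.hadoop.fs.Path(outputDir + "/part-r-00000");
    try (java.io.BufferedReader reader = new java.io.BufferedReader(
            new java.io.InputStreamReader(fs.open(part), java.nio.charset.StandardCharsets.UTF_8))) {
        String line;
        while ((line = reader.readLine()) != null) {
            response.getWriter().append(line).append("<br>");
        }
    }
}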
index.jsp
<%@ page language="java" contentType="text/html; charset=ISO-8859-1"
pageEncoding="ISO-8859-1"%>
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=ISO-8859-1">
<title>Insert title here</title>
</head>
<body>
<form action="WCServlet" method="post">
<input type="text" name="keyword"> <br>
<input type="submit" value="Submit">
</form>
<hr>
</body>
</html>
A quick walkthrough of the code:
index.jsp
1. It contains a single form; on submit, the request is handled by WCServlet. (The keyword field is not actually used by the job.)
WCServlet.java
1. doPost() simply delegates to doGet().
2. A String[] args array is created and filled in. args[0]="hdfs://master:9000/input/" is the HDFS directory holding the data the MR job will process (see the sketch after this list for one way to load sample data there). args[1]="hdfs://master:9000/output/"+ now.getTimeInMillis() is the directory the result will be written to.
3. The Hadoop cluster does not have your MR program, so you must upload it in advance. String a = System.getProperty("catalina.home") +"/lib/userlib/TWC2-3.jar"; is the path of that jar file, and args[2] = a; stores it in the array so it can be passed to the job.
4. TWC3.main(args); calls the main function of the TWC3 class, passing the array as its arguments.
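The job assumes there is already text data under hdfs://master:9000/input/. The simplest way to put a sample file there is hdfs dfs -put on the server; if you prefer to do it from Java, a minimal sketch (the class name and the local file path are placeholders, not part of the original project):
//UploadInput.java -- copy a local text file into the HDFS input directory the servlet points at
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class UploadInput {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        FileSystem fs = FileSystem.get(URI.create("hdfs://master:9000"), conf);
        fs.mkdirs(new Path("/input"));
        // the local path is a placeholder; point it at any text file you want counted
        fs.copyFromLocalFile(new Path("/tmp/sample.txt"), new Path("/input/sample.txt"));
        fs.close();
    }
}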
TWC3.java
1. MyMapper and MyReducer implement map and reduce respectively.
2. main() configures the job.
3. job.setJar(args[2]); sets the jar file that will actually be shipped and executed. As mentioned above, the cluster does not have your MR classes, so you have to package them into a jar yourself and upload it in advance.
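Step 3 below puts the cluster's XML files on the web project's classpath so that new Configuration() picks up the right addresses. If you would rather not copy the XML files, the key properties can also be set on the Configuration object before the job is created; a minimal sketch, assuming the same host names and the default ResourceManager ports as the files below:
// In-code equivalent of the XML files in step 3 (replaces the first lines of TWC3.main)
Configuration conf = new Configuration();
conf.set("fs.defaultFS", "hdfs://master:9000");
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "master");
// needed when the job is submitted from a non-Linux client (e.g. Eclipse on Windows)
conf.set("mapreduce.app-submission.cross-platform", "true");
Job job = Job.getInstance(conf);
job.setJar(args[2]);   // still ship the jar uploaded to Tomcat's lib/userlib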
3. Add the related configuration files to Eclipse
These files can be copied down from the Hadoop server (with FileZilla / Xftp, etc.) and then slightly modified.
core-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://master:9000</value>
</property>
<property>
<name>hadoop.tmp.dir</name>
<value>file:/home/ubuntu/developer/hadoop-2.7.3/tmp</value>
</property>
<property>
<name>io.file.buffer.size</name>
<value>131702</value>
</property>
</configuration>
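With core-site.xml on the classpath, new Configuration() already knows where the NameNode is. A quick way to verify the connection before wiring up the servlet is to list the HDFS root; a minimal sketch (the class name is illustrative):
//HdfsCheck.java -- list the HDFS root to confirm the client can reach hdfs://master:9000
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsCheck {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration(); // picks up core-site.xml from the classpath
        FileSystem fs = FileSystem.get(conf);
        for (FileStatus status : fs.listStatus(new Path("/"))) {
            System.out.println(status.getPath());
        }
        fs.close();
    }
}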
hdfs-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>dfs.namenode.name.dir</name>
<value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/name</value>
</property>
<property>
<name>dfs.datanode.data.dir</name>
<value>file:/home/ubuntu/developer/hadoop-2.7.3/hdfs/data</value>
</property>
<property>
<name>dfs.replication</name>
<value>2</value>
</property>
<property>
<name>dfs.namenode.secondary.http-address</name>
<value>master:9001</value>
</property>
<property>
<name>dfs.webhdfs.enabled</name>
<value>true</value>
</property>
<property>
<name>dfs.namenode.datanode.registration.ip-hostname-check</name>
<value>false</value>
</property>
<property>
<name>dfs.permissions</name>
<value>false</value>
</property>
</configuration>
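dfs.permissions is set to false here because Tomcat usually runs the job as a different user than the one that owns the HDFS directories. If you prefer to keep permission checking on, an alternative (assuming simple authentication, no Kerberos, and that the HDFS directories are owned by the ubuntu user) is to tell the Hadoop client which user to act as before any FileSystem or Job call:
// Make the Hadoop client identify itself as the HDFS user instead of the Tomcat user.
// "ubuntu" is an assumption; use whichever user owns /input and /output on HDFS.
System.setProperty("HADOOP_USER_NAME", "ubuntu");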
mapred-site.xml
<!-- Put site-specific property overrides in this file. -->
<configuration>
<property>
<name>mapred.remote.os</name>
<value>Linux</value>
</property>
<property>
<name>mapreduce.app-submission.cross-platform</name>
<value>true</value>
</property>
<property>
<name>mapreduce.application.classpath</name>
<value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*
</value>
</property>
<!-- =========================================== -->
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
<property>
<name>mapreduce.jobhistory.address</name>
<value>master:10020</value>
</property>
<property>
<name>mapreduce.jobhistory.webapp.address</name>
<value>master:19888</value>
</property>
<property>
<name>mapreduce.map.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.reduce.memory.mb</name>
<value>1024</value>
</property>
<property>
<name>mapreduce.map.java.opts</name>
<value>-Xmx512m</value>
</property>
<property>
<name>mapreduce.reduce.java.opts</name>
<value>-Xmx512m</value>
</property>
</configuration>
yarn-site.xml
<configuration>
<!-- Site specific YARN configuration properties -->
<property>
<name>yarn.application.classpath</name>
<value>/home/ubuntu/developer/hadoop-2.7.3/etc/hadoop,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/common/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/hdfs/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/mapreduce/lib/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/*,
/home/ubuntu/developer/hadoop-2.7.3/share/hadoop/yarn/lib/*</value>
</property>
<!-- Site specific YARN configuration properties -->
<property>
<description>The hostname of the RM.</description>
<name>yarn.resourcemanager.hostname</name>
<value>master</value>
</property>
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce_shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<property>
<name>yarn.resourcemanager.address</name>
<value>${yarn.resourcemanager.hostname}:8032</value>
</property>
<property>
<name>yarn.resourcemanager.scheduler.address</name>
<value>${yarn.resourcemanager.hostname}:8030</value>
</property>
<property>
<name>yarn.resourcemanager.resource-tracker.address</name>
<value>${yarn.resourcemanager.hostname}:8031</value>
</property>
<property>
<name>yarn.resourcemanager.admin.address</name>
<value>${yarn.resourcemanager.hostname}:8033</value>
</property>
<property>
<name>yarn.resourcemanager.webapp.address</name>
<value>${yarn.resourcemanager.hostname}:8088</value>
</property>
<property>
<description>The https address of the RM web application.</description>
<name>yarn.resourcemanager.webapp.https.address</name>
<value>${yarn.resourcemanager.hostname}:8090</value>
</property>
<property>
<description>List of directories to store localized files in. An
application's localized file directory will be found in:
${yarn.nodemanager.local-dirs}/usercache/${user}/appcache/application_${appid}.
Individual containers' work directories, called container_${contid}, will
be subdirectories of this.
</description>
<name>yarn.nodemanager.local-dirs</name>
<value>/data/hadoop/yarn/local</value>
</property>
<property>
<description>Whether to enable log aggregation</description>
<name>yarn.log-aggregation-enable</name>
<value>true</value>
</property>
<property>
<description>Where to aggregate logs to.</description>
<name>yarn.nodemanager.remote-app-log-dir</name>
<value>/data/tmp/logs</value>
</property>
<property>
<description>Amount of physical memory, in MB, that can be allocated
for containers.</description>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>2048</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>1.0</value>
</property>
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
</configuration>
log4j.properties
log4j.rootLogger=INFO, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
4. Export TWC3.java as a jar file
Right-click the TWC3.java file -- Export... -- Runnable JAR file
Save it wherever you like; in this article the jar is named TWC2-3.jar, matching the path used in WCServlet.
5. Export the WordCountPage project as a WAR file
Right-click the WordCountPage project -- Export -- WAR file
6. Upload the files to the server
1. Upload TWC2-3.jar to /home/ubuntu/developer/apache-tomcat-8.5.14/lib/userlib on the server.
2. Upload WordCountPage.war to /home/ubuntu/developer/apache-tomcat-8.5.14/webapps on the server.
7. Start Tomcat
On the server, go into the Tomcat directory and run:
bin/startup.sh
8. Run and test
Open http://your-IP:port/your-project-name in a browser and submit the form.
Some problems you may encounter
If the job fails with an HDFS permission error on the /tmp staging directory, loosen the permissions:
hdfs dfs -chmod -R 755 /tmp
Viewing the Tomcat log in real time on Linux:
1. Switch to the log directory: cd tomcat/logs
2. Run tail -f catalina.out
3. You can now follow the log output live while the application runs.
Press Ctrl+C to exit tail.