1. 首先下载与我们 CDH Hadoop 集群版本对应的 Hadoop 安装文件：
hadoop-2.6.0-cdh5.14.2.tar.gz
链接: https://pan.baidu.com/s/1iHm5M-gGZRWLKbzVjbYJmA 密码: q9nv
2. 将 Hadoop 解压到自己的本地电脑上：
Mac：/opt
Windows：D 盘
3. 将测试服务器上的 Hadoop 配置文件复制到自己本地 Hadoop 的配置文件目录
4. 配置环境变量并使之生效
- 如果使用 Scala / Java 开发，需要先下载依赖的 jar 包
例如 Scala 项目的 build.sbt：
// sbt build definition for a MapReduce job built against the CDH 5.14.2 Hadoop artifacts.
// Requires the sbt-assembly plugin (the `assembly` keys below come from it).
name := "sbtawsHadoop"
version := "0.1"
scalaVersion := "2.12.6"

// Every Hadoop artifact is pinned to the same CDH release as the target cluster.
libraryDependencies ++= Seq(
  "org.apache.hadoop" % "hadoop-common" % "2.6.0-cdh5.14.2",
  "org.apache.hadoop" % "hadoop-hdfs" % "2.6.0-cdh5.14.2",
  "org.apache.hadoop" % "hadoop-client" % "2.6.0-cdh5.14.2",
  "org.apache.hadoop" % "hadoop-mapreduce-client-core" % "2.6.0-cdh5.14.2",
  "org.apache.hadoop" % "hadoop-mapreduce-client-common" % "2.6.0-cdh5.14.2",
  "org.apache.hadoop" % "hadoop-mapreduce-client-jobclient" % "2.6.0-cdh5.14.2",
  // NOTE(review): "hbase" looks like the HBase parent module rather than a library jar;
  // confirm whether hbase-client / hbase-server is what the code actually needs.
  "org.apache.hbase" % "hbase" % "1.2.0-cdh5.14.2"
)

// Extra directories whose contents are put on the classpath as resources.
unmanagedResourceDirectories in Compile += baseDirectory.value / "conf"
unmanagedResourceDirectories in Compile += baseDirectory.value / "data"
unmanagedResourceDirectories in Compile += baseDirectory.value / "public"

// The original file assigned resourceDirectory twice ("data", then "conf"); with `:=` only the
// last assignment takes effect, so the dead "data" assignment was dropped. "data" is still
// included through unmanagedResourceDirectories above.
resourceDirectory in Compile := baseDirectory.value / "conf"

resolvers += "Sonatype OSS Snapshots" at "https://oss.sonatype.org/content/repositories/snapshots"
// The *-cdh5.14.2 artifacts are resolved from the Cloudera repository.
resolvers += "cdh" at "https://repository.cloudera.com/artifactory/cloudera-repos"

// Fat-jar output location and duplicate-file handling.
assemblyOutputPath in assembly := baseDirectory.value / "count-beat-80201.jar"
assemblyMergeStrategy in assembly := {
  // Keep ServiceLoader registrations (for example Hadoop FileSystem implementations).
  // Discarding them breaks scheme lookup ("No FileSystem for scheme: hdfs") whenever the
  // assembled jar runs without a full Hadoop installation on the classpath.
  case PathList("META-INF", "services", xs @ _*) => MergeStrategy.concat
  // Remaining META-INF entries (manifests, signatures, ...) are dropped.
  case PathList("META-INF", xs @ _*) => MergeStrategy.discard
  // For any other duplicate path, the first occurrence wins.
  case _ => MergeStrategy.first
}
su hdfs
ln -s /usr/local/hadoop-2.6.0-cdh5.14.2/bin/hadoop /usr/local/bin/hadoop
ln -s /usr/local/hadoop/bin/hadoop hadoop
sudo netstat -tulpn | grep :8020
CDH 配置文件路径：
/run/cloudera-scm-agent/process/350-yarn-RESOURCEMANAGER/
/run/cloudera-scm-agent/process/350-yarn-RESOURCEMANAGER/yarn-site.xml
/var/log/hadoop-yarn/hadoop-cmf-yarn-RESOURCEMANAGER-cdhnode1.log.out
/opt/cloudera/parcel-repo//CDH-5.14.2-1.cdh5.14.2.p0.3/lib/hadoop-yarn/bin/yarn nodemanager
/opt/cloudera/parcels/CDH-5.14.2-1.cdh5.14.2.p0.3/bin/yarn
/opt/cloudera/parcels/CDH-5.14.2-1.cdh5.14.2.p0.3/jars/hadoop-common-2.6.0-cdh5.14.2.jar
/usr/local/Cellar/hadoop/2.8.2/bin/hadoop jar ./count-beat-80201.jar ApplistCount
hdfs://cdhnode1:8020/originData/clientlabel/AA77p2_20180525.txt
http://archive.cloudera.com/cdh5/cdh/5/hadoop-2.6.0-cdh5.14.2.tar.gz
export HADOOP_HOME=/opt/cloudera/parcels/CDH-5.14.2-1.cdh5.14.2.p0.3
export YARN_HOME=$HADOOP_HOME
export PATH=$PATH:$HADOOP_HOME/bin:$YARN_HOME/bin
su hdfs
hadoop jar /opt/hadoop-mapreduce-examples-2.8.2.jar wordcount
/originData/clientlabel/output
hadoop jar ./GeoCreditPro-beat-2.0.jar ApplistCount /originData/clientlabel/AA77p2_20180525.txt /originData/clientlabel/output2
yarn resourcemanager
ApplistCount /originData/clientlabel/AA77p2_20180525.txt /originData/clientlabel/output4
core-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<!-- core-site.xml: cluster-wide Hadoop client settings copied from the CDH test cluster. -->
<configuration>

  <!-- Default filesystem and trash -->
  <property>
    <name>fs.defaultFS</name>
    <value>hdfs://cdhnode1:8020</value>
  </property>
  <property>
    <name>fs.trash.interval</name>
    <value>1</value>
  </property>

  <!-- Compression codecs -->
  <property>
    <name>io.compression.codecs</name>
    <value>org.apache.hadoop.io.compress.DefaultCodec,org.apache.hadoop.io.compress.GzipCodec,org.apache.hadoop.io.compress.BZip2Codec,org.apache.hadoop.io.compress.DeflateCodec,org.apache.hadoop.io.compress.SnappyCodec,org.apache.hadoop.io.compress.Lz4Codec</value>
  </property>

  <!-- Authentication / authorization -->
  <property>
    <name>hadoop.security.authentication</name>
    <value>simple</value>
  </property>
  <property>
    <name>hadoop.security.authorization</name>
    <value>false</value>
  </property>
  <property>
    <name>hadoop.rpc.protection</name>
    <value>authentication</value>
  </property>
  <property>
    <name>hadoop.security.auth_to_local</name>
    <value>DEFAULT</value>
  </property>

  <!-- Proxy-user rules: a hosts/groups pair per service account, all set to the wildcard "*" -->
  <property>
    <name>hadoop.proxyuser.oozie.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.oozie.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.mapred.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.flume.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.HTTP.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hive.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hue.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.httpfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.groups</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.hdfs.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.yarn.groups</name>
    <value>*</value>
  </property>

  <!-- Group mapping, instrumentation access, rack topology script -->
  <property>
    <name>hadoop.security.group.mapping</name>
    <value>org.apache.hadoop.security.ShellBasedUnixGroupsMapping</value>
  </property>
  <property>
    <name>hadoop.security.instrumentation.requires.admin</name>
    <value>false</value>
  </property>
  <property>
    <name>net.topology.script.file.name</name>
    <value>/etc/hadoop/conf.cloudera.yarn/topology.py</value>
  </property>

  <!-- SSL switch, followed by the llama proxy-user pair (kept in the original order) -->
  <property>
    <name>hadoop.ssl.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>hadoop.proxyuser.llama.hosts</name>
    <value>*</value>
  </property>
  <property>
    <name>hadoop.proxyuser.llama.groups</name>
    <value>*</value>
  </property>

  <!-- SSL details; these four are marked final -->
  <property>
    <name>hadoop.ssl.require.client.cert</name>
    <value>false</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.keystores.factory.class</name>
    <value>org.apache.hadoop.security.ssl.FileBasedKeyStoresFactory</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.server.conf</name>
    <value>ssl-server.xml</value>
    <final>true</final>
  </property>
  <property>
    <name>hadoop.ssl.client.conf</name>
    <value>ssl-client.xml</value>
    <final>true</final>
  </property>

  <!-- HTTP log endpoint -->
  <property>
    <name>hadoop.http.logs.enabled</name>
    <value>true</value>
  </property>

</configuration>
hdfs-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<!-- hdfs-site.xml: HDFS settings copied from the CDH test cluster. -->
<configuration>

  <!-- NameNode storage directories and endpoints (all on cdhnode1) -->
  <property>
    <name>dfs.namenode.name.dir</name>
    <value>file:///data1/dfs/nn,file:///data2/dfs/nn</value>
  </property>
  <property>
    <name>dfs.namenode.servicerpc-address</name>
    <value>cdhnode1:8022</value>
  </property>
  <property>
    <name>dfs.https.address</name>
    <value>cdhnode1:50470</value>
  </property>
  <property>
    <name>dfs.https.port</name>
    <value>50470</value>
  </property>
  <property>
    <name>dfs.namenode.http-address</name>
    <value>cdhnode1:50070</value>
  </property>

  <!-- Replication factor and block size (134217728 bytes = 128 MiB) -->
  <property>
    <name>dfs.replication</name>
    <value>2</value>
  </property>
  <property>
    <name>dfs.blocksize</name>
    <value>134217728</value>
  </property>

  <!-- Client behaviour, permissions and ACLs -->
  <property>
    <name>dfs.client.use.datanode.hostname</name>
    <value>false</value>
  </property>
  <property>
    <name>fs.permissions.umask-mode</name>
    <value>022</value>
  </property>
  <property>
    <name>dfs.namenode.acls.enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.use.legacy.blockreader</name>
    <value>false</value>
  </property>

  <!-- Short-circuit reads and domain socket (disabled here) -->
  <property>
    <name>dfs.client.read.shortcircuit</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.domain.socket.path</name>
    <value>/var/run/hdfs-sockets/dn</value>
  </property>
  <property>
    <name>dfs.client.read.shortcircuit.skip.checksum</name>
    <value>false</value>
  </property>
  <property>
    <name>dfs.client.domain.socket.data.traffic</name>
    <value>false</value>
  </property>

  <!-- DataNode block metadata API -->
  <property>
    <name>dfs.datanode.hdfs-blocks-metadata.enabled</name>
    <value>true</value>
  </property>

</configuration>
mapred-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<!-- MapReduce client settings: jobs are submitted to YARN, with cross-platform submission enabled. -->
<configuration>

  <!-- Execution framework -->
  <property>
    <name>mapreduce.framework.name</name>
    <value>yarn</value>
  </property>

  <!-- Cross-platform submission (client OS may differ from the Linux cluster) -->
  <property>
    <name>mapred.remote.os</name>
    <value>Linux</value>
  </property>
  <property>
    <name>mapreduce.app-submission.cross-platform</name>
    <value>true</value>
  </property>

  <!-- JobHistory web endpoints -->
  <property>
    <name>mapreduce.jobhistory.webapp.address</name>
    <value>cdhnode1:19888</value>
  </property>
  <property>
    <name>mapreduce.jobhistory.webapp.https.address</name>
    <value>cdhnode1:19890</value>
  </property>

  <!-- Application classpath. Kept on a single line on purpose: whitespace and line breaks inside
       <value> are part of the value, so the previous one-entry-per-line layout only worked where
       the consumer trimmed each entry. The entries themselves are unchanged. -->
  <property>
    <name>mapreduce.application.classpath</name>
    <value>/usr/local/hadoop/etc/hadoop,/usr/local/hadoop/share/hadoop/common/*,/usr/local/hadoop/share/hadoop/common/lib/*,/usr/local/hadoop/share/hadoop/hdfs/*,/usr/local/hadoop/share/hadoop/hdfs/lib/*,/usr/local/hadoop/share/hadoop/mapreduce/*,/usr/local/hadoop/share/hadoop/mapreduce/lib/*,/usr/local/hadoop/share/hadoop/yarn/*,/usr/local/hadoop/share/hadoop/yarn/lib/*</value>
  </property>

</configuration>
yarn-site.xml
<?xml version="1.0" encoding="UTF-8"?>
<!--Autogenerated by Cloudera Manager-->
<!-- yarn-site.xml: YARN settings copied from the ResourceManager process directory of the CDH
     test cluster, plus a locally added yarn.application.classpath at the end of the file. -->
<configuration>

  <!-- ACLs and log aggregation (604800 s = 7 days) -->
  <property>
    <name>yarn.acl.enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.admin.acl</name>
    <value>*</value>
  </property>
  <property>
    <name>yarn.log-aggregation-enable</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.log-aggregation.retain-seconds</name>
    <value>604800</value>
  </property>

  <!-- ResourceManager HA, recovery and ZooKeeper state store -->
  <property>
    <name>yarn.resourcemanager.ha.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.automatic-failover.embedded</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.recovery.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.zk-address</name>
    <value>cdhnode1:2181,cdhmaster:2181,cdhnode2:2181,cdhnode3:2181</value>
  </property>
  <property>
    <name>yarn.resourcemanager.store.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.recovery.ZKRMStateStore</value>
  </property>
  <property>
    <name>yarn.client.failover-sleep-base-ms</name>
    <value>100</value>
  </property>
  <property>
    <name>yarn.client.failover-sleep-max-ms</name>
    <value>2000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.cluster-id</name>
    <value>yarnRM</value>
  </property>
  <property>
    <name>yarn.resourcemanager.work-preserving-recovery.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.id</name>
    <value>rm198</value>
  </property>

  <!-- Endpoints of ResourceManager rm198 (cdhnode1) -->
  <property>
    <name>yarn.resourcemanager.address.rm198</name>
    <value>cdhnode1:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm198</name>
    <value>cdhnode1:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm198</name>
    <value>cdhnode1:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address.rm198</name>
    <value>cdhnode1:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm198</name>
    <value>cdhnode1:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address.rm198</name>
    <value>cdhnode1:8090</value>
  </property>

  <!-- Endpoints of ResourceManager rm214 (cdhnode3) -->
  <property>
    <name>yarn.resourcemanager.address.rm214</name>
    <value>cdhnode3:8032</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.address.rm214</name>
    <value>cdhnode3:8030</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.address.rm214</name>
    <value>cdhnode3:8031</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.address.rm214</name>
    <value>cdhnode3:8033</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.address.rm214</name>
    <value>cdhnode3:8088</value>
  </property>
  <property>
    <name>yarn.resourcemanager.webapp.https.address.rm214</name>
    <value>cdhnode3:8090</value>
  </property>
  <property>
    <name>yarn.resourcemanager.ha.rm-ids</name>
    <value>rm198,rm214</value>
  </property>

  <!-- Proxy-user privileges and node include/exclude lists -->
  <property>
    <name>yarn.resourcemanager.proxy-user-privileges.enabled</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.resourcemanager.nodes.include-path</name>
    <value>/run/cloudera-scm-agent/process/350-yarn-RESOURCEMANAGER/nodes_allow.txt</value>
  </property>
  <property>
    <name>yarn.resourcemanager.nodes.exclude-path</name>
    <value>/run/cloudera-scm-agent/process/350-yarn-RESOURCEMANAGER/nodes_exclude.txt</value>
  </property>

  <!-- ResourceManager RPC handler thread counts -->
  <property>
    <name>yarn.resourcemanager.client.thread-count</name>
    <value>50</value>
  </property>
  <property>
    <name>yarn.resourcemanager.scheduler.client.thread-count</name>
    <value>50</value>
  </property>
  <property>
    <name>yarn.resourcemanager.admin.client.thread-count</name>
    <value>1</value>
  </property>

  <!-- Container allocation limits (memory in MB, CPU in vcores) -->
  <property>
    <name>yarn.scheduler.minimum-allocation-mb</name>
    <value>1024</value>
  </property>
  <property>
    <name>yarn.scheduler.increment-allocation-mb</name>
    <value>512</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-mb</name>
    <value>53931</value>
  </property>
  <property>
    <name>yarn.scheduler.minimum-allocation-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.increment-allocation-vcores</name>
    <value>1</value>
  </property>
  <property>
    <name>yarn.scheduler.maximum-allocation-vcores</name>
    <value>24</value>
  </property>

  <!-- Liveness monitoring of ApplicationMasters, containers and NodeManagers -->
  <property>
    <name>yarn.resourcemanager.amliveliness-monitor.interval-ms</name>
    <value>1000</value>
  </property>
  <property>
    <name>yarn.am.liveness-monitor.expiry-interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.am.max-attempts</name>
    <value>2</value>
  </property>
  <property>
    <name>yarn.resourcemanager.container.liveness-monitor.interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.nm.liveness-monitor.interval-ms</name>
    <value>1000</value>
  </property>
  <property>
    <name>yarn.nm.liveness-monitor.expiry-interval-ms</name>
    <value>600000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.resource-tracker.client.thread-count</name>
    <value>50</value>
  </property>

  <!-- yarn.application.classpath used to be defined twice in this file. The Cloudera Manager
       definition that stood here was overridden by the later one and therefore had no effect;
       it is kept only as a record of the cluster-side value:
       $HADOOP_CLIENT_CONF_DIR,$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*
       The effective definition is the last property in this file. -->

  <!-- Fair Scheduler -->
  <property>
    <name>yarn.resourcemanager.scheduler.class</name>
    <value>org.apache.hadoop.yarn.server.resourcemanager.scheduler.fair.FairScheduler</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.allow-undeclared-pools</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.user-as-default-queue</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.preemption</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.preemption.cluster-utilization-threshold</name>
    <value>0.8</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.sizebasedweight</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.assignmultiple</name>
    <value>true</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.continuous-scheduling-enabled</name>
    <value>false</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.locality-delay-node-ms</name>
    <value>2000</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.locality-delay-rack-ms</name>
    <value>4000</value>
  </property>
  <property>
    <name>yarn.scheduler.fair.continuous-scheduling-sleep-ms</name>
    <value>5</value>
  </property>

  <!-- Application history size and ZooKeeper session timeout -->
  <property>
    <name>yarn.resourcemanager.max-completed-applications</name>
    <value>10000</value>
  </property>
  <property>
    <name>yarn.resourcemanager.zk-timeout-ms</name>
    <value>60000</value>
  </property>

  <!-- Effective application classpath (local /usr/local/hadoop layout). Kept on a single line
       because whitespace and line breaks inside <value> are part of the value; the entries are
       the same as in the previous one-entry-per-line layout. -->
  <property>
    <name>yarn.application.classpath</name>
    <value>/usr/local/hadoop/etc/hadoop,/usr/local/hadoop/share/hadoop/common/*,/usr/local/hadoop/share/hadoop/common/lib/*,/usr/local/hadoop/share/hadoop/hdfs/*,/usr/local/hadoop/share/hadoop/hdfs/lib/*,/usr/local/hadoop/share/hadoop/mapreduce/*,/usr/local/hadoop/share/hadoop/mapreduce/lib/*,/usr/local/hadoop/share/hadoop/yarn/*,/usr/local/hadoop/share/hadoop/yarn/lib/*</value>
  </property>

</configuration>