Requirements
- Java 8
cd /opt
wget http://mirrors.hust.edu.cn/apache/zookeeper/stable/zookeeper-3.4.7.tar.gz
tar -zvxf zookeeper-3.4.7.tar.gz
mv zookeeper-3.4.7 zookeeper
cd zookeeper
cd conf
cp zoo_sample.cfg zoo.cfg
vi zoo.cfg
tickTime=2000
initLimit=10
syncLimit=5
dataDir=/home/hadoop/zookeeper/export
clientPort=2181
#server.1=zoo1:2888:3888
#server.2=zoo2:2888:3888
#server.3=zoo3:2888:3888
Install Kafka
cd /opt
wget http://mirrors.hust.edu.cn/apache/kafka/0.9.0.0/kafka-0.9.0.0-src.tgz
tar -zvxf kafka-0.9.0.0-src.tgz
mv kafka-0.9.0.0 kafka
Install Hadoop
cd /opt
wget http://mirrors.hust.edu.cn/apache/hadoop/common/hadoop-2.7.1/hadoop-2.7.1.tar.gz
tar -zvxf hadoop-2.7.1.tar.gz
mv hadoop-2.7.1 hadoop
cd hadoop
vi etc/hadoop/hdfs-site.xml
<property>
<name>dfs.datanode.max.transfer.threads</name>
<value>4096</value>
</property>
<property>
<name>dfs.replication</name>
<value>1</value>
</property>
<property>
<name>dfs.name.dir</name>
<value>file:///opt/hadoop/hadoopinfra/hdfs/namenode</value>
</property>
<property>
<name>dfs.data.dir</name>
<value>file:///opt/hadoop/hadoopinfra/hdfs/datanode</value>
</property>
vi etc/hadoop/hadoop-env.sh
export JAVA_HOME=/usr
vi etc/hadoop/core-site.xml
<configuration>
<property>
<name>fs.default.name</name>
<value>hdfs://master:9000</value>
</property>
</configuration>
vi etc/hadoop/core-site.xml
<property>
<name>fs.default.name</name>
<value>hdfs://localhost:9000</value>
</property>
vi etc/hadoop/yarn-site.xml
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
cp etc/hadoop/mapred-site.xml.template etc/hadoop/mapred-site.xml
vi etc/hadoop/mapred-site.xml
<property>
<name>mapreduce.framework.name</name>
<value>yarn</value>
</property>
Initial format of namenode - format will erase any existing data
hdfs namenode -format
Install HBase
cd /opt
wget http://mirrors.hust.edu.cn/apache/hbase/stable/hbase-1.1.2-src.tar.gz
tar -zvxf hbase-1.1.2-src.tar.gz
mv hbase-1.1.2 hbase
cd hbase
vi conf/hbase-env.sh
export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export HBASE_REGIONSERVERS=${HBASE_HOME}/conf/regionservers
export HBASE_MANAGES_ZK=false
# Configure PermSize. Only needed in JDK7. You can safely remove it for JDK8+
#export HBASE_MASTER_OPTS="$HBASE_MASTER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
#export HBASE_REGIONSERVER_OPTS="$HBASE_REGIONSERVER_OPTS -XX:PermSize=128m -XX:MaxPermSize=128m"
vi conf/hbase-site.xml
<configuration>
<property>
<name>hbase.zookeeper.quorum</name>
<value>master,data2,data3</value>
<description>Comma-separated list of hosts in the ZooKeeper quorum.
</description>
</property>
<property>
<name>hbase.zookeeper.property.dataDir</name>
<value>/home/hadoop/zookeeper/export</value>
<description>Property from ZooKeeper config zoo.cfg.
The directory where the snapshot is stored.
</description>
</property>
<property>
<name>hbase.rootdir</name>
<value>hdfs://master:9000/home/hadoop/hbase</value>
<description>The directory shared by RegionServers.
</description>
</property>
<property>
<name>hbase.cluster.distributed</name>
<value>true</value>
<description>The mode the cluster will be in. Possible values are
false: standalone and pseudo-distributed setups with managed Zookeeper
true: fully-distributed with unmanaged Zookeeper Quorum (see hbase-env.sh)
</description>
</property>
</configuration>
vi conf/regionservers
data2
data3
ENV
vi ~/.bashrc
如果是用yum安装了jdk1.8,那就不要配置 export JAVA_HOME=/opt/jdk1.8.0_40
#HADOOP VARIABLES START
#export JAVA_HOME=/usr/lib/jvm/java-1.8.0-openjdk
export JAVA_HOME=/opt/jdk1.8.0_40
export HADOOP_INSTALL=/opt/hadoop
export PATH=$PATH:$HADOOP_INSTALL/bin
export PATH=$PATH:$HADOOP_INSTALL/sbin
export HADOOP_MAPRED_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_HOME=$HADOOP_INSTALL
export HADOOP_HDFS_HOME=$HADOOP_INSTALL
export YARN_HOME=$HADOOP_INSTALL
export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_INSTALL/lib/native
export HADOOP_OPTS="-Djava.library.path=$HADOOP_INSTALL/lib/native"
export HADOOP_HOME=$HADOOP_INSTALL
#HADOOP VARIABLES END
#HBASE VARIABLES
export HBASE_HOME=/opt/hbase
export HBASE_CONF=$HBASE_HOME/conf
export CLASSPATH=$CLASSPATH:$HBASE_HOME/lib/*
#HBASE VARIABLES END
export PATH=$PATH:$HBASE_HOME/bin
export CQLSH_HOST=127.0.0.1
export CQLSH_PORT=9042
source ~/.bashrc
Additional installs
yum install thrift
yum install snappy-devel
pip install sqlalchemy
pip install zmq
pip install pyzmq
Install Distributed Frontera from GIT - Recommended method
cd /opt
git clone https://github.com/scrapinghub/distributed-frontera.git
pip install /opt/distributed-frontera
Install Distributed Frontera with PIP
pip install distributed-frontera
pip install hbase-thrift
pip install PyHBase
required:
happybase, kafka-python, msgpack-python, python-snappy, frontera, thrift
firewall tweaking
sudo firewall-cmd --zone=public --add-port=2181/tcp --permanent
sudo firewall-cmd --zone=public --add-port=60000/tcp --permanent
sudo firewall-cmd --zone=public --add-port=9000/tcp --permanent
sudo firewall-cmd --zone=public --add-port=9000/tcp --permanent
sudo firewall-cmd --reload
start and stop services
hadoop
/opt/hadoop/sbin/start-dfs.sh
/opt/hadoop/sbin/start-yarn.sh
/opt/hadoop/sbin/stop-dfs.sh
/opt/hadoop/sbin/stop-yarn.sh
zookeeper
/opt/zookeeper/bin/zkServer.sh start
/opt/zookeeper/bin/zkServer.sh stop
view zookeeper
/opt/zookeeper/bin/zkCli.sh -server 127.0.0.1:2181
hbase
/opt/hbase/bin/hbase-daemon.sh start master
/opt/hbase/bin/hbase-daemon.sh start regionserver
/opt/hbase/bin/hbase-daemon.sh stop master
/opt/hbase/bin/hbase-daemon.sh stop regionserver
thrift for hbase
hbase thrift start
hbase thrift -p 7777 start
kafka
/opt/kafka/bin/kafka-server-start.sh /opt/kafka/config/server.properties
Verify services are running
jps
must have these running
25571 HMaster
25764 HRegionServer
26420 Main
25110 DataNode
26519 Jps
24968 NameNode
14988 QuorumPeerMain
25310 SecondaryNameNode
sample
https://github.com/scrapinghub/distributed-frontera/blob/master/docs/source/topics/quickstart.rst
/opt/hbase/bin/hbase shell
create_namespace 'crawler'
quit
cd /var/www/html
cd general-spider
vi frontier/workersettings.py
=== replace content ===
# -*- coding: utf-8 -*-
from frontera.settings.default_settings import *
#from distributed_frontera.settings.default_settings import MIDDLEWARES
from distributed_frontera.settings import default_settings
MAX_REQUESTS = 0
MAX_NEXT_REQUESTS = 128 # Size of batch to generate per partition, should be consistent with
# CONCURRENT_REQUESTS in spider. General recommendation is 5-7x CONCURRENT_REQUESTS
CONSUMER_BATCH_SIZE = 512 # Batch size for updates to backend storage
NEW_BATCH_DELAY = 30.0 # This causes the spider to wait for the specified time after getting an empty response from
# the backend
#--------------------------------------------------------
# Url storage
#--------------------------------------------------------
BACKEND = 'distributed_frontera.backends.hbase.HBaseBackend'
HBASE_DROP_ALL_TABLES = False
HBASE_THRIFT_PORT = 9090
HBASE_THRIFT_HOST = 'localhost'
HBASE_QUEUE_PARTITIONS = 2 # Count of spider instances
STORE_CONTENT = True
MIDDLEWARES.extend([
'frontera.contrib.middlewares.domain.DomainMiddleware',
'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware'
])
KAFKA_LOCATION = 'localhost:9092'
FRONTIER_GROUP = 'scrapy-crawler'
INCOMING_TOPIC = 'frontier-done' # Topic to which spiders send fetching results
OUTGOING_TOPIC = 'frontier-todo' # Requests that need to be downloaded are written here
SCORING_GROUP = 'scrapy-scoring'
SCORING_TOPIC = 'frontier-score' # Scores provided by strategy worker using this channel and read by storage
# worker.
#--------------------------------------------------------
# Logging
#--------------------------------------------------------
LOGGING_EVENTS_ENABLED = False
LOGGING_MANAGER_ENABLED = True
LOGGING_BACKEND_ENABLED = True
LOGGING_DEBUGGING_ENABLED = False
vi frontier/spider_settings.py
=== replace content ===
# -*- coding: utf-8 -*-
from frontera.settings.default_settings import *
#from distributed_frontera.settings.default_settings import MIDDLEWARES
from distributed_frontera.settings import default_settings
SPIDER_PARTITION_ID = 0 # Partition ID assigned
MAX_NEXT_REQUESTS = 256 # Should be consistent with MAX_NEXT_REQUESTS set for Frontera worker
DELAY_ON_EMPTY = 5.0
MIDDLEWARES.extend([
'frontera.contrib.middlewares.domain.DomainMiddleware',
'frontera.contrib.middlewares.fingerprint.DomainFingerprintMiddleware'
])
#--------------------------------------------------------
# Crawl frontier backend
#--------------------------------------------------------
BACKEND = 'distributed_frontera.backends.remote.KafkaOverusedBackend'
KAFKA_LOCATION = 'localhost:9092' # Your Kafka service location
SPIDER_PARTITION_ID = 0 # Partition ID assigned
HBASE_NAMESPACE = 'crawler'
#--------------------------------------------------------
# Logging
#--------------------------------------------------------
LOGGING_ENABLED = True
LOGGING_EVENTS_ENABLED = False
LOGGING_MANAGER_ENABLED = False
LOGGING_BACKEND_ENABLED = False
LOGGING_DEBUGGING_ENABLED = False
open new terminal -> start ZeroMQ broker
cd /var/www/html/general-spider
python -m distributed_frontera.messagebus.zeromq.broker
open new terminal -> start DB worker
cd /var/www/html/general-spider
python -m distributed_frontera.worker.main --config frontier.workersettings
open new terminal -> start strategy worker
cd /var/www/html/general-spider
python -m distributed_frontera.worker.score --config frontier.strategy0 --strategy distributed_frontera.worker.strategy.bfs
open new terminal -> Starting the spiders
cd /var/www/html/general-spider
scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider0 -s SEEDS_SOURCE=seeds_es_dmoz.txt
scrapy crawl general -L INFO -s FRONTERA_SETTINGS=frontier.spider1
注意:
启动顺序为 hadoop -> ZooKeeper -> HBase
关闭顺序为 HBase -> ZooKeeper -> hadoop