1. 背景
之前将pyspider部署在了一台机子上,这一台负责「数据存储」,「消息队列」,「任务调度」,「URL抓取」,「任务处理」。
现在发现有些性能瓶颈,就再开了一台机子做「URL抓取」及「任务处理」。
2. 机子
- 爬虫主机:192.168.1.33
- 附属机:192.168.1.71
3. 安装
- docker
- docker-compose
- docker pull pyspider
4. 非爬虫部分配置
docker配置网络接口:
docker network create --driver bridge pyspider
-
数据库服务:我使用
MongoDB
,由于之前就把服务齐起来了,就不进行虚拟化了(Docker),如果你是新起一个分布式爬虫,建议数据库服务也使用Docker。将下面的db连接改为你自己的:
taskdb:mongodb+taskdb://192.168.1.33:27017/taskdb
projectdb:mongodb+projectdb://192.168.1.33:27017/projectdb
resultdb:mongodb+resultdb://192.168.1.33:27017/resultdb
-
消息队列服务:
redis
。命令:docker run --network=pyspider --name redis -d -p 6379:6379 redis
将下面的queue连接改为你自己的:
message-queue:redis://192.168.1.33:6379/0
注意: 我下面用的是非官方镜像
qbb6/pyspider
,因为我需要依赖一些原生 pyspider 没有的库;如果你不需要额外的库,请将下文所有的 qbb6/pyspider 替换为官方镜像 binux/pyspider。
5. 主机配置
启动 pyspider scheduler
模块
# Start the single scheduler on the master host (only one scheduler may run
# in a pyspider cluster). NOTE: there must be a space before each trailing
# backslash, otherwise the image name and the next flag are fused into one token.
docker run --network=pyspider --name scheduler -d -p 23333:23333 --restart=always qbb6/pyspider \
  --taskdb "mongodb+taskdb://192.168.1.33:27017/taskdb" \
  --resultdb "mongodb+resultdb://192.168.1.33:27017/resultdb" \
  --projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" \
  --message-queue "redis://192.168.1.33:6379/0" \
  scheduler --inqueue-limit 10000 --delete-time 3600
编写 docker-compose.yml
version: '2'

services:
  # Headless browser workers; scaled horizontally behind phantomjs-lb.
  phantomjs:
    image: 'qbb6/pyspider:latest'
    command: phantomjs
    cpu_shares: 256
    environment:
      - 'EXCLUDE_PORTS=5000,23333,24444'
    expose:
      - '25555'
    mem_limit: 256m
    restart: always

  # haproxy load-balances across the phantomjs replicas.
  phantomjs-lb:
    image: 'dockercloud/haproxy:latest'
    links:
      - phantomjs
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always

  # URL fetchers; talk to phantomjs via the load balancer alias.
  fetcher:
    image: 'qbb6/pyspider:latest'
    command: '--message-queue "redis://192.168.1.33:6379/0" --phantomjs-proxy "phantomjs:80" fetcher --xmlrpc'
    cpu_shares: 256
    environment:
      - 'EXCLUDE_PORTS=5000,25555,23333'
    links:
      - 'phantomjs-lb:phantomjs'
    mem_limit: 256m
    restart: always

  # haproxy load-balances across the fetcher replicas.
  fetcher-lb:
    image: 'dockercloud/haproxy:latest'
    links:
      - fetcher
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always

  # Task processors; only need projectdb and the message queue.
  processor:
    image: 'qbb6/pyspider:latest'
    command: '--projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" --message-queue "redis://192.168.1.33:6379/0" processor'
    cpu_shares: 256
    mem_limit: 256m
    restart: always

  # Result workers persist crawl results into resultdb.
  result-worker:
    image: 'qbb6/pyspider:latest'
    command: '--taskdb "mongodb+taskdb://192.168.1.33:27017/taskdb" --projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" --resultdb "mongodb+resultdb://192.168.1.33:27017/resultdb" --message-queue "redis://192.168.1.33:6379/0" result_worker'
    cpu_shares: 256
    mem_limit: 256m
    restart: always

  # Web UI, exposed on host port 5001; uses the scheduler's XML-RPC endpoint
  # and the fetcher load balancer.
  webui:
    image: 'qbb6/pyspider:latest'
    command: '--taskdb "mongodb+taskdb://192.168.1.33:27017/taskdb" --projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" --resultdb "mongodb+resultdb://192.168.1.33:27017/resultdb" --message-queue "redis://192.168.1.33:6379/0" webui --scheduler-rpc "http://192.168.1.33:23333/" --fetcher-rpc "http://fetcher/"'
    cpu_shares: 256
    environment:
      - 'EXCLUDE_PORTS=24444,25555,23333'
    ports:
      - '5001:5000'
    links:
      - 'fetcher-lb:fetcher'
    mem_limit: 256m
    restart: always

# Attach all services to the pre-created "pyspider" bridge network
# (docker network create --driver bridge pyspider).
networks:
  default:
    external:
      name: pyspider
构建镜像,创建、启动服务
前台启动:docker-compose up
后台启动:docker-compose up -d
配置 phantomjs
,processor
,result-worker
的进程数量
docker-compose scale phantomjs=2 processor=4 result-worker=2
6. 附属机配置
编写 docker-compose.yml
version: '2'

services:
  # Headless browser workers; scaled horizontally behind phantomjs-lb.
  phantomjs:
    image: 'qbb6/pyspider:latest'
    command: phantomjs
    cpu_shares: 256
    environment:
      - 'EXCLUDE_PORTS=5000,23333,24444'
    expose:
      - '25555'
    mem_limit: 256m
    restart: always

  # haproxy load-balances across the phantomjs replicas.
  phantomjs-lb:
    image: 'dockercloud/haproxy:latest'
    links:
      - phantomjs
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always

  # URL fetchers; the message queue and databases live on the master host
  # (192.168.1.33), so no local scheduler or webui is needed here.
  fetcher:
    image: 'qbb6/pyspider:latest'
    command: '--message-queue "redis://192.168.1.33:6379/0" --phantomjs-proxy "phantomjs:80" fetcher --xmlrpc'
    cpu_shares: 256
    environment:
      - 'EXCLUDE_PORTS=5000,25555,23333'
    links:
      - 'phantomjs-lb:phantomjs'
    mem_limit: 256m
    restart: always

  # haproxy load-balances across the fetcher replicas.
  fetcher-lb:
    image: 'dockercloud/haproxy:latest'
    links:
      - fetcher
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock
    restart: always

  # Task processors; only need projectdb and the message queue.
  processor:
    image: 'qbb6/pyspider:latest'
    command: '--projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" --message-queue "redis://192.168.1.33:6379/0" processor'
    cpu_shares: 256
    mem_limit: 256m
    restart: always

  # Result workers persist crawl results into resultdb on the master host.
  result-worker:
    image: 'qbb6/pyspider:latest'
    command: '--taskdb "mongodb+taskdb://192.168.1.33:27017/taskdb" --projectdb "mongodb+projectdb://192.168.1.33:27017/projectdb" --resultdb "mongodb+resultdb://192.168.1.33:27017/resultdb" --message-queue "redis://192.168.1.33:6379/0" result_worker'
    cpu_shares: 256
    mem_limit: 256m
    restart: always

# Attach all services to the pre-created "pyspider" bridge network.
networks:
  default:
    external:
      name: pyspider
构建镜像,创建、启动服务
后台启动:docker-compose up -d
前台启动:docker-compose up
配置 phantomjs
,processor
,result-worker
的进程数量
docker-compose scale phantomjs=2 processor=4 result-worker=2