安装配置
安装
scrapyd服务端: pip install scrapyd
scrapyd客户端: pip install scrapyd-client
配置文件(查看命令): cat /etc/scrapyd/scrapyd.conf
# Deployed scrapyd configuration (contents of /etc/scrapyd/scrapyd.conf).
[scrapyd]
# Storage locations; everything lives under /data/project/scrapyd.
eggs_dir = /data/project/scrapyd/eggs
logs_dir = /data/project/scrapyd/logs
# Left empty in this deployment.
# NOTE(review): presumably an empty value disables item storage - confirm.
items_dir =
jobs_to_keep = 30
dbs_dir = /data/project/scrapyd/dbs
# max_proc is 0, so the effective process limit is derived from
# max_proc_per_cpu (number of CPUs * max_proc_per_cpu).
max_proc = 0
max_proc_per_cpu = 4
finished_to_keep = 40
poll_interval = 5.0
# NOTE(review): 0.0.0.0 listens on every network interface and this file sets
# no username/password - confirm port 6800 is not reachable from untrusted hosts.
bind_address = 0.0.0.0
http_port = 6800
debug = off
# Dotted paths of the components scrapyd loads at start-up.
runner = scrapyd.runner
application = scrapyd.app.application
launcher = scrapyd.launcher.Launcher
webroot = scrapyd.website.Root
# Maps each JSON API endpoint name to the webservice class that handles it.
[services]
schedule.json = scrapyd.webservice.Schedule
cancel.json = scrapyd.webservice.Cancel
addversion.json = scrapyd.webservice.AddVersion
listprojects.json = scrapyd.webservice.ListProjects
listversions.json = scrapyd.webservice.ListVersions
listspiders.json = scrapyd.webservice.ListSpiders
delproject.json = scrapyd.webservice.DeleteProject
delversion.json = scrapyd.webservice.DeleteVersion
listjobs.json = scrapyd.webservice.ListJobs
daemonstatus.json = scrapyd.webservice.DaemonStatus
scrapyd配置解释
# Annotated scrapyd.conf reference.
# Every note is on its own line on purpose: scrapyd parses this file with
# Python's configparser, which does not strip trailing "# ..." text by default,
# so an inline note would become part of the value.
[scrapyd]
# Directory for uploaded egg files: eggs_dir/<project>/*.egg
eggs_dir = eggs
# Directory for log files: logs_dir/<project>/<spider>/*.log
logs_dir = logs
# Directory for item feed files: items_dir/<project>/<spider>/*.jl
items_dir = items
# Number of finished jobs to keep per spider (applies to their log/item files).
jobs_to_keep = 5
# Directory for the SQLite database files: dbs_dir/*.db
dbs_dir = dbs
# Maximum number of concurrent Scrapy processes.
# 0 means: number of CPUs * max_proc_per_cpu.
max_proc = 0
# Maximum number of concurrent Scrapy processes per CPU.
max_proc_per_cpu = 4
# Number of finished job records to keep; older records are dropped
# automatically once this limit is exceeded.
finished_to_keep = 100
# Interval, in seconds, between polls of the job queue.
poll_interval = 5.0
# IP address the HTTP service binds to.
bind_address = 127.0.0.1
# TCP port the HTTP service listens on.
http_port = 6800
# User name for HTTP basic authentication (empty: authentication disabled).
username =
# Password for HTTP basic authentication.
password =
# Debug mode switch; boolean (on/off, true/false).
debug = off
# Module used to run jobs (default runner).
runner = scrapyd.runner
# Where job records are stored. Options:
#   scrapyd.jobstorage.MemoryJobStorage  - kept in memory
#   scrapyd.jobstorage.SqliteJobStorage  - persisted in SQLite
jobstorage = scrapyd.jobstorage.MemoryJobStorage
# Entry point that builds the scrapyd service application.
application = scrapyd.app.application
# Class that schedules and launches jobs.
launcher = scrapyd.launcher.Launcher
# Class that serves the web UI.
webroot = scrapyd.website.Root
# Class that manages the stored egg files.
eggstorage = scrapyd.eggstorage.FilesystemEggStorage

# JSON API endpoints: endpoint name = handler class.
[services]
# Schedule (add) a job.
schedule.json = scrapyd.webservice.Schedule
# Cancel a job.
cancel.json = scrapyd.webservice.Cancel
# Add a project (upload a project version).
addversion.json = scrapyd.webservice.AddVersion
# List all projects.
listprojects.json = scrapyd.webservice.ListProjects
# List all versions of a project.
listversions.json = scrapyd.webservice.ListVersions
# List all spiders of a given project/version.
listspiders.json = scrapyd.webservice.ListSpiders
# Delete a project.
delproject.json = scrapyd.webservice.DeleteProject
# Delete one specific version of a project.
delversion.json = scrapyd.webservice.DeleteVersion
# List all jobs: finished (historical), running and pending.
listjobs.json = scrapyd.webservice.ListJobs
# Report the status of the scrapyd server.
daemonstatus.json = scrapyd.webservice.DaemonStatus
使用技巧
参考文档:https://blog.csdn.net/yanggd1987/article/details/79223842