scrapy-redis is chewing through memory: with a 20-million-fingerprint Redis dedup set plus the MongoDB cache, 15 GB of RAM is barely holding up. I'm not sure whether my program is at fault, but it feels like memory shouldn't run out this fast, so I decided to optimize scrapy-redis with a Bloom filter. The approach follows 《Python3网络爬虫开发实战》.
Reference link: https://cloud.tencent.com/developer/article/1084962
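For scale: with the parameters used below (a 2^30-bit bitmap and six hash functions), the filter costs a fixed 128 MB of Redis memory, and the theoretical false-positive rate at 20 million entries is negligible. A quick back-of-the-envelope check:

import math

n = 20_000_000   # fingerprints already collected
m = 1 << 30      # bits in the filter (BLOOMFILTER_BIT = 30)
k = 6            # hash functions (BLOOMFILTER_HASH_NUMBER = 6)

print(m / 8 / 1024 ** 2, "MB bitmap")              # 128.0 MB, fixed cost
p = (1 - math.exp(-k * n / m)) ** k                # standard Bloom filter estimate
print(f"estimated false-positive rate: {p:.6%}")   # on the order of 1e-4 %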
BLOOMFILTER_HASH_NUMBER = 6
BLOOMFILTER_BIT = 30

class HashMap(object):
    def __init__(self, m, seed):
        self.m = m          # size of the bit array
        self.seed = seed    # seed that differentiates each hash function

    def hash(self, value):
        # Simple seeded polynomial hash, folded into the range [0, m)
        ret = 0
        for i in range(len(value)):
            ret += self.seed * ret + ord(value[i])
        return (self.m - 1) & ret

class BloomFilter(object):
    def __init__(self, server, key, bit=BLOOMFILTER_BIT, hash_number=BLOOMFILTER_HASH_NUMBER):
        self.m = 1 << bit                # total bits: 2^30 bits = 128 MB
        self.seeds = range(hash_number)  # one seed per hash function
        self.maps = [HashMap(self.m, seed) for seed in self.seeds]
        self.server = server             # redis connection
        self.key = key                   # redis string key holding the bitmap

    def exists(self, value):
        if not value:
            return False
        exist = 1
        for f in self.maps:
            offset = f.hash(value)
            exist = exist & self.server.getbit(self.key, offset)
        return exist  # 1 only if every bit is set (may be a false positive)

    def insert(self, value):
        for f in self.maps:
            offset = f.hash(value)
            self.server.setbit(self.key, offset, 1)
A quick test makes the flow clear:
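For example, a minimal smoke test against a local Redis instance (the host, port, and key name are placeholders; passing a smaller bit keeps the test bitmap tiny):

import redis

conn = redis.Redis(host="localhost", port=6379)
bf = BloomFilter(conn, "test:bloomfilter", bit=20)  # 2^20 bits = 128 KB, plenty for a test
bf.insert("hello")
print(bf.exists("hello"))  # 1 -> seen before
print(bf.exists("world"))  # 0 -> not seen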
Then save the code above as a Python file and place that file in the scrapy-redis source directory.
Next, modify dupefilter.py in the scrapy-redis source:
from .bloom import BloomFilter  # bloom is the filename I saved the code under
......
def __init__(self, server, key, debug=False):  # modified constructor
    self.server = server
    self.key = key
    self.debug = debug
    self.logdupes = True
    self.bf = BloomFilter(server, self.key)  # new: the Bloom filter backing dedup
......
def request_seen(self, request):  # modified method
    fp = self.request_fingerprint(request)
    # Original set-based dedup:
    # This returns the number of values added, zero if already exists.
    # added = self.server.sadd(self.key, fp)
    # return added == 0
    if self.bf.exists(fp):
        return True
    self.bf.insert(fp)
    return False
Everything else stays the same as a normal scrapy-redis project, and you run it the same way.
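For reference, since the filter is patched directly into RFPDupeFilter, the usual scrapy-redis settings are unchanged (a sketch; the Redis URL is a placeholder):

# settings.py
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
REDIS_URL = "redis://localhost:6379"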
The remaining problem is how to convert the existing data into the Bloom format.
The method I came up with is admittedly crude: pop the fingerprints out of the existing Redis set, run them through the bloom file above, and write them back into Redis as a bitmap. Be sure to back up the original data first.
from bloom import BloomFilter  # reuse the classes saved above (assumes bloom.py is on the path)
import redis

conn = redis.Redis(host="your-redis-host", port=6379)
bf = BloomFilter(conn, "country1:dupefilter")  # target redis string key for the bitmap
# spop removes members as it pops them, so the loop ends when the original set is empty.
# Back up the original data before running this!
while conn.scard('country:dupefilter'):
    fp = conn.spop('country:dupefilter').decode('utf-8')  # spop returns bytes
    # The original set is already deduplicated, so no exists() check is needed;
    # insert straight into the Bloom filter.
    bf.insert(fp)
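After the migration it's worth a quick sanity check (key names as above; STRLEN reports the bitmap's size in bytes):

assert conn.scard('country:dupefilter') == 0        # the original set has been fully drained
print(conn.strlen('country1:dupefilter'), "bytes")  # size of the bitmap string
print(bf.exists(fp))                                # 1 -> the last popped fingerprint is in the filter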
If you weren't using scrapy-redis for dedup from the start and only discovered the memory problem later, you'll need to generate the fingerprints by hand and then save them into the Bloom filter.
from scrapy.utils.request import request_fingerprint

class Request_Fingerprint(object):
    """A minimal stand-in for scrapy.Request -- just the attributes request_fingerprint() reads."""
    def __init__(self, url):
        self.method = 'GET'  # change to 'POST' for POST requests
        self.url = url
        self.body = b''

def run(url):
    '''
    Manually build the fingerprint for a URL.
    :param url: the URL to fingerprint
    :return: the fingerprint string, identical to what scrapy-redis computes
    '''
    re = Request_Fingerprint(url=url)
    fp = request_fingerprint(re)
    return fp  # the scrapy-redis fingerprint
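Tying it together, a sketch that backfills the filter from a plain-text dump of already-crawled URLs (the file name and redis details are placeholders):

from bloom import BloomFilter  # the file saved above
import redis

conn = redis.Redis(host="localhost", port=6379)
bf = BloomFilter(conn, "country1:dupefilter")
with open("crawled_urls.txt") as f:    # hypothetical dump, one URL per line
    for url in f:
        bf.insert(run(url.strip()))    # fingerprint the URL, then set its bits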
That's the whole workflow. If you have any thoughts, please leave a comment below.