Some notes on the pitfalls I ran into recently while learning FilesPipeline.
Things you must change to use FilesPipeline:
1. The item file must define the file_urls and files fields
import scrapy

class Txt80NItem(scrapy.Item):
    # define the fields for your item here like:
    name = scrapy.Field()        # used below as the saved file name
    file_urls = scrapy.Field()   # FilesPipeline reads download URLs from this field
    files = scrapy.Field()       # FilesPipeline writes download results to this field
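For comparison, if you use the stock FilesPipeline without subclassing it, file_urls is expected to be a list of URLs, and the results land in files; a minimal sketch, with a made-up URL:
def parse_last(self, response):  # inside the spider, stock-pipeline variant
    item = Txt80NItem()
    item['file_urls'] = ['http://example.com/some_book.txt']  # hypothetical URL; must be a list here
    yield item
    # after the download, item['files'] holds dicts with 'url', 'path' and 'checksum'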
2. In the spider, extract the file name and the download URL
import re

from txt80n.items import Txt80NItem

def parse_last(self, response):
    s = bytes.decode(response.body)
    url = re.search(r'http.*\.txt', s).group()            # extract the download link
    f_name = re.search(r'[\u4E00-\u9FA5]+', url).group()  # extract the file name (the CJK characters in the URL)
    item = Txt80NItem()
    item['file_urls'] = url   # a single URL string; handled by the custom pipeline in step 3
    item['name'] = f_name
    yield item
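To see what those two regexes actually capture, here is a standalone check on a made-up page fragment (both the fragment and the URL are hypothetical):
import re

s = 'href="http://example.com/files/三国演义.txt"'  # hypothetical page fragment
url = re.search(r'http.*\.txt', s).group()
f_name = re.search(r'[\u4E00-\u9FA5]+', url).group()
print(url)     # http://example.com/files/三国演义.txt
print(f_name)  # 三国演义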
3. In pipelines.py, import the FilesPipeline and Request modules. By default a downloaded file is named after the SHA1 hash of its URL; to change the file name you must override file_path.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.files import FilesPipeline


class Txt80NPipeline(FilesPipeline):
    def get_media_requests(self, item, info):
        # pass the desired file name along in request.meta
        yield Request(item['file_urls'], meta={'name': item['name']})

    def file_path(self, request, response=None, info=None):
        # replace the default SHA1-hash name with the name from meta
        txt_guid = request.meta['name']
        return '%s.txt' % txt_guid
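On Scrapy 2.4 and later, file_path also receives the item itself, so the name does not have to travel through request.meta; a sketch of that variant:
def file_path(self, request, response=None, info=None, *, item=None):
    # Scrapy >= 2.4 passes the item directly, so request.meta is not needed
    return '%s.txt' % item['name']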
4. In settings.py, set FILES_STORE and enable the pipeline in ITEM_PIPELINES
FILES_STORE = 'E:/TXT'   # directory where downloaded files are saved

ITEM_PIPELINES = {
    'txt80n.pipelines.Txt80NPipeline': 300,  # priority in the 0-1000 range; lower values run first
}
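Two related settings are easy to overlook; both are optional, and the values shown are the Scrapy defaults:
FILES_EXPIRES = 90             # days before an already-downloaded file is fetched again
MEDIA_ALLOW_REDIRECTS = False  # set to True if the download URLs redirect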