1. First, create a Scrapy project:
Go to the directory where the project should live and run: scrapy startproject [project name]
Then enter the project directory and create the spider: scrapy genspider [spider name] [domain]
At this point the Scrapy project is set up.
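With the names used later in this post, the concrete commands would look roughly like this (project name scrapy_deomo1, spider name zhiping, target domain 17k.com):

scrapy startproject scrapy_deomo1
cd scrapy_deomo1
scrapy genspider zhiping 17k.com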
2. Analyze the page source:
After logging in, locate the bookshelf/favorites content; from there it can be parsed with XPath, CSS selectors, regular expressions, and so on.
Preparation done; let's get to work.
The first problem to solve is the simulated login. Here we use Selenium inside a downloader middleware to fill in the username and password and click the login button, imitating a real user.
Inside the downloader middleware we therefore need a way to tell apart which requests should go through Selenium and which should not. (Selenium is memory-hungry, so it is best avoided unless there is no other option; it is used here mainly to document the learning process.)
To distinguish the two kinds of requests, we define a new request class that inherits from Scrapy's Request. That gives us a request with exactly the same behaviour as the original but a different type.
Create a .py file (imported later as scrapy_deomo1/myrequests.py) and define a class named SeleniumRequest:

import scrapy


class SeleniumRequest(scrapy.Request):
    # Behaves exactly like scrapy.Request; the subclass only exists so the
    # downloader middleware can recognise it with isinstance().
    pass
Next, we need a way to classify requests inside the middleware. We use isinstance() to check the type of each request instance:
from scrapy import signals
from selenium import webdriver

from scrapy_deomo1.myrequests import SeleniumRequest


class ScrapyDeomo1DownloaderMiddleware:
    def __init__(self):
        self.webdriver = None

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def process_request(self, request, spider):
        if isinstance(request, SeleniumRequest):
            # This request needs to be handled by Selenium.
            pass
        return None

    def process_response(self, request, response, spider):
        return response

    def process_exception(self, request, exception, spider):
        pass

    def spider_opened(self, spider):
        # Open a browser window when the spider starts.
        # (Passing the driver path positionally is the old Selenium 3 style;
        # newer Selenium versions use a Service object instead.)
        self.webdriver = webdriver.Chrome(r'G:\web\chromedriver.exe')

    def spider_closed(self, spider):
        # Close the browser when the spider finishes.
        if self.webdriver:
            self.webdriver.quit()
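One step the post does not show explicitly: the downloader middleware has to be enabled in settings.py, otherwise process_request is never called. Assuming the default module layout generated by scrapy startproject, the entry would look roughly like this:

# settings.py -- module path and priority follow the generated defaults
DOWNLOADER_MIDDLEWARES = {
    'scrapy_deomo1.middlewares.ScrapyDeomo1DownloaderMiddleware': 543,
}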
Walking through the login flow, Selenium needs to:
1. type in the username
2. type in the password
3. tick the agreement checkbox
4. click the login button
The implementation is below (with Selenium it is just a matter of locating each input/button and typing or clicking):
    # Requires at module level:
    #   import time
    #   from scrapy.http import HtmlResponse
    #   from selenium.webdriver.common.by import By
    def process_request(self, request, spider):
        # Called for each request that goes through the downloader middleware.
        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        if isinstance(request, SeleniumRequest):
            self.webdriver.get(request.url)
            # Username input
            self.webdriver.find_element(by=By.XPATH, value='/html/body/form/dl/dd[2]/input')\
                .send_keys('your username')
            # Password input
            self.webdriver.find_element(by=By.XPATH, value='/html/body/form/dl/dd[3]/input')\
                .send_keys('your password')
            # Agreement checkbox: //*[@id="protocol"]
            self.webdriver.find_element(by=By.XPATH, value='//*[@id="protocol"]') \
                .click()
            # Login button: /html/body/form/dl/dd[5]/input
            self.webdriver.find_element(by=By.XPATH, value='/html/body/form/dl/dd[5]/input') \
                .click()
            time.sleep(2)
            page_source = self.webdriver.page_source
            # Wrap the rendered page in an HtmlResponse. Returning a Response
            # object here stops the request from going through the remaining
            # downloader middlewares.
            return HtmlResponse(
                url=request.url,
                status=200,
                body=page_source,
                request=request,
                encoding="utf-8"
            )
        # Returning None lets the request continue through the rest of the
        # middleware chain.
        return None
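As a side note, a more robust alternative to the fixed time.sleep(2) is an explicit wait, so the middleware only grabs page_source once the browser has actually left the login page. A minimal sketch, assuming a successful login navigates away from a URL containing '/login':

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait at most 10 seconds for the browser to leave the login page
WebDriverWait(self.webdriver, 10).until_not(EC.url_contains('/login'))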
With the simulated login in place, the spider only has to parse the responses and extract the data. There is nothing special about this part, just plain XPath parsing:
import scrapy

from scrapy_deomo1.myrequests import SeleniumRequest
from scrapy_deomo1.items import NovelItem


class ZhipingSpider(scrapy.Spider):
    name = 'zhiping'
    allowed_domains = ['17k.com']
    start_urls = ['https://passport.17k.com/login/']

    def start_requests(self):
        # The login page goes through Selenium (SeleniumRequest);
        # everything else uses plain scrapy.Request.
        yield SeleniumRequest(url=self.start_urls[0],
                              callback=self.parse)

    def parse(self, response, **kwargs):
        a_list = response.xpath('/html/body/div[2]/div/div/div[1]/div[1]/div/div/a')
        meta = {}
        for a_item in a_list:
            info_href = 'https:' + a_item.xpath('@href').extract_first()
            image_src = a_item.xpath('./img/@src').extract_first()
            title = a_item.xpath('./p/text()').extract_first()
            # Pass the fields to the detail-page callback through meta
            # (scrapy.Request copies the dict, so reusing it here is safe).
            meta['title'] = title
            meta['image_src'] = image_src
            meta['info_href'] = info_href
            yield scrapy.Request(
                url=info_href,
                callback=self.parse_info,
                meta=meta
            )

    def parse_info(self, response, **kwargs):
        novel_item = NovelItem()
        novel_item['title'] = response.meta['title']
        novel_item['image_src'] = response.meta['image_src']
        novel_item['info_href'] = response.meta['info_href']
        introduction = response.xpath('//p[@class="intro"]/a/text()').extract_first()
        novel_item['introduction'] = introduction
        yield novel_item
In the items file we only need the following fields:
title = scrapy.Field()         # novel title
image_src = scrapy.Field()     # remote URL of the cover image
info_href = scrapy.Field()     # URL of the novel's detail page
introduction = scrapy.Field()  # novel introduction
image_path = scrapy.Field()    # local path the cover image is saved to
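Put together, items.py would look roughly like this (only the field names above come from the walkthrough; the rest is standard Scrapy Item boilerplate):

import scrapy


class NovelItem(scrapy.Item):
    title = scrapy.Field()         # novel title
    image_src = scrapy.Field()     # remote URL of the cover image
    info_href = scrapy.Field()     # URL of the novel's detail page
    introduction = scrapy.Field()  # novel introduction
    image_path = scrapy.Field()    # local path the cover image is saved to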
Once this data is available, the pipelines take over: the text fields are written to the database and the images are saved to disk. I wrote two pipelines for this.
The first pipeline stores the images. Scrapy already ships with an image-download pipeline, so we just inherit from it and fill in three methods: 1. get_media_requests, which issues the download request for the image URL stored in the item; 2. file_path, which decides the local path of the downloaded image (this also requires IMAGES_STORE = 'the directory you want to write to' in the settings file); 3. item_completed, which receives the download results, from which we take the local path and file name and put them into the item so the local path can be written to the database along with everything else.
import scrapy
from scrapy.pipelines.images import ImagesPipeline


class MYImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Issue the download request for the cover image.
        yield scrapy.Request(item['image_src'])

    def file_path(self, request, response=None, info=None, *, item=None):
        # Build the file name (relative to IMAGES_STORE).
        file_name = request.url.split('/')[-1].split('-')[0]
        return f'./img/{file_name}'

    def item_completed(self, results, item, info):
        # results is a list of (success, file_info_or_failure) tuples.
        ok, finfo = results[0]
        if ok:
            item['image_path'] = finfo["path"]
        return item
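For completeness, the pipeline registration and the IMAGES_STORE setting mentioned above might look like this in settings.py (the module path and priority numbers are assumptions based on the default project layout; the image pipeline gets the smaller number so it runs first and fills in image_path before the database pipeline reads it):

ITEM_PIPELINES = {
    'scrapy_deomo1.pipelines.MYImagePipeline': 300,
    'scrapy_deomo1.pipelines.ScrapyDeomo1Pipeline': 400,
}
IMAGES_STORE = './images'  # base directory for downloaded images; pick your own path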
The other pipeline writes the data to the database. This is just basic pymysql usage; the only thing to remember is to define the connection parameters in the settings file.
import pymysql

# MYSQL is the connection-settings dict defined in settings.py (sketch below).
from scrapy_deomo1.settings import MYSQL


class ScrapyDeomo1Pipeline:
    def process_item(self, item, spider):
        cursor = self.conn.cursor()
        sql = "insert into novel (title, image_path_local, introduce, image_path_network) values (%s, %s, %s, %s)"
        cursor.execute(sql, (item['title'], item['image_path'], item['introduction'], item['image_src']))
        self.conn.commit()
        return item

    def open_spider(self, spider):
        # Open the MySQL connection when the spider starts.
        self.conn = pymysql.connect(host=MYSQL['host'],
                                    port=MYSQL['port'],
                                    user=MYSQL['username'],
                                    password=MYSQL['password'],
                                    database=MYSQL['database'])

    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
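The MYSQL dictionary read in open_spider is assumed to live in settings.py; the keys below are exactly the ones the pipeline uses, while the values are placeholders. The novel table itself needs at least the four columns named in the INSERT statement:

# settings.py -- placeholder values, adjust to your environment
MYSQL = {
    'host': 'localhost',
    'port': 3306,
    'username': 'root',
    'password': 'your_password',
    'database': 'your_database',
}
# Expected columns in the `novel` table:
# title, image_path_local, introduce, image_path_network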
That's it. Run the spider with scrapy crawl zhiping and check the results.