六、Scrapy框架 – 实战 – 古诗文网爬虫实战(3)
在 settings.py 中启用 Item Pipeline:
# Enable the project's item pipeline. The integer is the pipeline's order
# value: Scrapy runs enabled pipelines from the lowest value to the highest.
ITEM_PIPELINES = {
    'gsww.pipelines.GswwPipeline': 300,
}
pipelines.py代码
import json
class GswwPipeline(object):
    """Item pipeline that stores every scraped item in ``古诗文.txt``.

    The file is written in JSON Lines form: one JSON object per line.
    Scrapy calls ``open_spider`` when the spider starts, ``process_item``
    for each item the spider yields, and ``close_spider`` when it finishes.
    """

    # Output file handle; stays None until open_spider() has run.
    fp = None

    def open_spider(self, spider):
        """Open the output file for writing, truncating any previous content."""
        self.fp = open("古诗文.txt", 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Append *item* to the output file as a single JSON line.

        ``ensure_ascii=False`` writes non-ASCII characters as-is instead of
        ``\\uXXXX`` escapes, so the Chinese text stays readable in the file.

        Returns the item unchanged so that later pipelines can process it.
        """
        self.fp.write(json.dumps(dict(item), ensure_ascii=False) + "\n")
        return item

    def close_spider(self, spider):
        """Close the output file if it was opened."""
        # Guard against open_spider() never having run (or having failed),
        # so shutdown does not raise a second, unrelated AttributeError.
        if self.fp is not None:
            self.fp.close()
            self.fp = None
items.py代码
import scrapy
class GswwItem(scrapy.Item):
    """Container for the fields scraped for a single poem."""

    # Poem title.
    title = scrapy.Field()
    # Dynasty the poem is attributed to.
    dynasty = scrapy.Field()
    # Author of the poem.
    author = scrapy.Field()
    # Body text of the poem.
    content = scrapy.Field()
gsww_spider.py示例代码
import scrapy
from lxml import etree
from scrapy.http.response.html import HtmlResponse
from scrapy.selector.unified import Selector
from ..items import GswwItem # 从上层导入模块使用..
class GswwSpiderSpider(scrapy.Spider):
    """Spider that scrapes poems from gushiwen.org page by page.

    Each poem block on a listing page becomes one ``GswwItem``; the spider
    then follows the page's "more" link, if present, to the next page.
    """

    name = 'gsww_spider'
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://gushiwen.org/default_1.aspx']

    def myprint(self, value):
        """Debug helper: print *value* between two ``=`` separator lines."""
        print("=" * 30)
        print(value)
        print("=" * 30)

    def parse(self, response):
        """Yield one ``GswwItem`` per poem, then a request for the next page.

        Notes on the selector API used here:

        * ``response.xpath(...)`` returns a ``SelectorList`` whose elements
          are ``Selector`` objects.
        * ``SelectorList.getall()`` extracts every matched value as a list.
        * ``SelectorList.get()`` extracts only the first matched value.
        """
        gsw_divs = response.xpath("//div[@class='left']/div[@class='sons']")
        for gsw_div in gsw_divs:
            # getall() keeps every <b> text node, so ``title`` is a list;
            # use .get() instead if only the first value is wanted.
            title = gsw_div.xpath('.//b/text()').getall()

            source = gsw_div.xpath(".//p[@class='source']/a/text()").getall()
            if len(source) < 2:
                # Without this guard, indexing source[0]/source[1] raises
                # IndexError, which aborts the whole callback: the remaining
                # items and the next-page request would be lost.
                self.logger.warning(
                    "Skipping entry without dynasty/author links on %s",
                    response.url,
                )
                continue
            dynasty, author = source[:2]

            # ``//text()`` collects the text of every descendant of the
            # ``contson`` div, not just its direct children.
            content_list = gsw_div.xpath(
                ".//div[@class='contson']//text()"
            ).getall()
            content = "".join(content_list).strip()

            yield GswwItem(
                title=title,
                dynasty=dynasty,
                author=author,
                content=content,
            )

        next_href = response.xpath("//a[@id='amore']/@href").get()
        if next_href:
            # The href may be relative; resolve it against the current page.
            next_url = response.urljoin(next_href)
            # parse() is already the default callback; naming it makes the
            # pagination loop explicit.
            yield scrapy.Request(next_url, callback=self.parse)
上一篇文章 第六章 Scrapy框架(五) 2020-03-07 地址:
https://www.jianshu.com/p/cd1f301999c5
下一篇文章 第六章 Scrapy框架(七) 2020-03-09 地址:
https://www.jianshu.com/p/07dbc4fd3efd
以上资料内容来源于网络,仅供学习交流;如有侵权,请私信我删除,谢谢。