1.新建项目
scrapy startproject cnblog
2.pycharm 打开项目
3.新建spider
新建main.py
from scrapy import cmdline

# Launch the "cnblog" spider with the same argv a shell would pass for
# `scrapy crawl cnblog`, so the crawl can be started by running this file.
crawl_command = ["scrapy", "crawl", "cnblog"]
cmdline.execute(crawl_command)
爬虫代码(放在 cnblog/spiders/ 目录下)
import scrapy
from cnblog.items import CnblogItem
class Cnblog_Spider(scrapy.Spider):
    """Spider that collects post titles and links from the cnblogs.com front page.

    One item is yielded per response; its ``title`` and ``link`` fields are
    lists extracted from the ``<a class="titlelnk">`` anchors on the page.
    """

    name = "cnblog"
    # Must match the host used in start_urls ("cnblogs.com", with the "s");
    # with the wrong domain the offsite filter would reject follow-up
    # requests to the site being crawled.
    allowed_domains = ["cnblogs.com"]
    start_urls = [
        'https://www.cnblogs.com/',
    ]

    def parse(self, response):
        """Extract every post title and link from ``response`` into one item.

        Yields:
            CnblogItem: ``title`` holds the anchor texts and ``link`` holds
            the ``href`` values of the ``titlelnk`` anchors.
        """
        item = CnblogItem()
        item['title'] = response.xpath('//a[@class="titlelnk"]/text()').extract()
        item['link'] = response.xpath('//a[@class="titlelnk"]/@href').extract()
        yield item
items.py 代码
import scrapy
class CnblogItem(scrapy.Item):
    """Container for the fields scraped by the cnblog project."""

    # Post title text(s).
    title = scrapy.Field()
    # Post URL(s).
    link = scrapy.Field()
settings.py 配置
# Scrapy settings for the cnblog project.
BOT_NAME = 'cnblog'
SPIDER_MODULES = ['cnblog.spiders']
NEWSPIDER_MODULE = 'cnblog.spiders'

# Default headers attached to outgoing requests.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    # Newly added: browser-like User-Agent.
    'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
}

# Newly modified: enabled item pipelines. Items pass through pipelines in
# ascending order of these values, so each pipeline gets a distinct number
# to make the execution order explicit (file first, then MySQL).
ITEM_PIPELINES = {
    'cnblog.pipelines.FilePipeline': 300,  # save items to a txt file
    'cnblog.pipelines.mysqlPipeline': 400,  # save items to MySQL
}
4.存储成txt文件(pipelines.py 代码)
class FilePipeline(object):
    """Item pipeline that saves scraped titles and links to ``cnblog.txt``.

    Each item is expected to carry two lists, ``item['title']`` and
    ``item['link']``; they are paired positionally and written one
    ``title:link`` pair per line.
    """

    # Whether this pipeline instance has already written to the output file.
    _has_written = False

    def process_item(self, item, spider):
        """Write the item's title/link pairs to ``cnblog.txt`` and return the item.

        The first item handled by this instance truncates the file so every
        crawl starts from a clean file; later items are appended so that they
        do not overwrite what earlier items wrote.

        Args:
            item: Mapping with ``'title'`` and ``'link'`` lists of strings.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, unchanged, for the next pipeline stage.
        """
        mode = 'a' if self._has_written else 'w'
        titles = item['title']
        links = item['link']
        # The ``with`` block closes the file; no explicit close() is needed.
        with open('cnblog.txt', mode, encoding='utf-8') as f:
            f.writelines(
                title + ':' + link + '\n' for title, link in zip(titles, links)
            )
        self._has_written = True
        return item