Goal: collect the list of perfume brands from every country on Fragrantica.
Stack: Scrapy + MySQL
Project layout:
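The file names used below follow the standard layout created by scrapy startproject Fgt:

Fgt/
├── scrapy.cfg
└── Fgt/
    ├── items.py
    ├── pipelines.py
    ├── settings.py
    └── spiders/
        └── fragrantica.py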
Code:
fragrantica.py
# -*- coding: utf-8 -*-
import scrapy
import re
from Fgt.items import FgtItem


class FragranticaSpider(scrapy.Spider):
    name = 'fragrantica'
    allowed_domains = ['fragrantica.asia']
    start_urls = ['https://www.fragrantica.asia/country/China.html']

    def parse(self, response):
        # The country <select> dropdown on any country page lists every country
        a_list = response.css('select option').extract()
        pat = r'value="(.*?\.html)"'
        for i in a_list:
            res = re.findall(pat, i)
            if not res:
                continue  # skip <option> tags without a .html value
            # Build the absolute URL of the country page
            url = 'https://www.fragrantica.asia' + res[0]
            yield scrapy.Request(url=url, callback=self.parse_info, dont_filter=True)

    def parse_info(self, response):
        # Each anchor in the brand list carries the URL, logo and name
        b_list = response.css('.nduList a').extract()
        base = 'https://www.fragrantica.asia'
        pat1 = r'href="(.*?\.html)"'           # extract brand page URL
        pat2 = r'src="(.*?\.jpg)"'             # extract brand logo
        pat3 = r'\.html">(.*?) <br><img src='  # extract brand name
        for j in b_list:
            res1 = re.findall(pat1, j)
            res2 = re.findall(pat2, j)
            res3 = re.findall(pat3, j)
            if not (res1 and res2 and res3):
                continue  # skip anchors missing any of the three fields
            item_one = FgtItem()
            item_one['html'] = base + res1[0]
            item_one['logo'] = res2[0]
            item_one['company'] = res3[0]
            yield item_one
items.py
# -*- coding: utf-8 -*-
import scrapy


class FgtItem(scrapy.Item):
    html = scrapy.Field()     # brand page URL
    company = scrapy.Field()  # brand name
    logo = scrapy.Field()     # brand logo

    def get_insert_sql(self):
        # The concrete INSERT statement and the values to bind
        insert_sql = """
            insert into fgt(html, company, logo)
            VALUES (%s, %s, %s)
        """
        params = (self["html"],
                  self["company"],
                  self["logo"],
                  )
        return insert_sql, params
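Defining get_insert_sql on the item keeps the SQL next to the fields it persists, so the pipeline below stays generic: any item class that implements get_insert_sql can be written to MySQL by the same pipeline.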
pipelines.py
# -*- coding: utf-8 -*-
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi


class FgtPipeline(object):
    def process_item(self, item, spider):
        return item


# Asynchronous MySQL inserts
class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        # settings can be read like a dict
        dbparms = dict(
            host=settings["MYSQL_HOST"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        # Twisted's ConnectionPool wraps MySQLdb connections
        dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
        # Equivalent to instantiating the pipeline; received in __init__
        return cls(dbpool)

    def process_item(self, item, spider):
        # Make the MySQL insert asynchronous via Twisted;
        # do_insert is our own function holding the insert logic
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Attach our own error-handling function
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        # Execute the actual insert:
        # each item type builds its own SQL via get_insert_sql()
        insert_sql, params = item.get_insert_sql()
        cursor.execute(insert_sql, params)

    def handle_error(self, failure, item, spider):
        # Handle exceptions from the asynchronous insert
        print(failure)
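For MysqlTwistedPipeline to receive its connection parameters, the MYSQL_* keys it reads must be defined in settings.py, and the pipeline has to be enabled there. A minimal sketch, with placeholder credentials:

settings.py (sketch)
# -*- coding: utf-8 -*-
BOT_NAME = 'Fgt'
SPIDER_MODULES = ['Fgt.spiders']
NEWSPIDER_MODULE = 'Fgt.spiders'

# Connection values read by MysqlTwistedPipeline.from_settings()
MYSQL_HOST = '127.0.0.1'          # placeholder
MYSQL_DBNAME = 'fgt_db'           # placeholder
MYSQL_USER = 'root'               # placeholder
MYSQL_PASSWORD = 'your_password'  # placeholder

# Enable the asynchronous pipeline
ITEM_PIPELINES = {
    'Fgt.pipelines.MysqlTwistedPipeline': 300,
}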
Database table design
CREATE TABLE fgt(
html VARCHAR(100),
company VARCHAR(40),
logo VARCHAR(100)
);
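With the table created and the settings filled in, the crawl is started from the project root with: scrapy crawl fragrantica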
Crawl result: 3,276 records in total.