2021-07-22

本周四，项目仍在有条不紊地继续进行中。

针对外国网站如何爬取信息，我进行了学习和实践。

有请主角scrapy登场

我们新建一个名为abroadwebsite的项目，以及一个名为abroad的爬虫（通用爬虫，创建时使用 -t crawl 模板）

先分析站点信息

会发现每一个站点的网址里都含有“site”这个字符串，把它写入Rule里LinkExtractor的allow参数中

打开网址

这里有网站的具体信息，我们用xpath把自己认为有用的提取出来就行

最后我们还要把每一页到下一页的节点分析出来

这里把下一页的网址存入Rules LinkExtractor中就可以一页页地爬取了

分析完毕上代码（只上改动了的）

爬虫 abroad

# -*- coding: utf-8 -*-

import scrapy

from scrapy.linkextractors import LinkExtractor

from scrapy.spiders import CrawlSpider, Rule

from abroadwebsite.items import *

USER_AGENT = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"

class AbroadSpider(CrawlSpider):
    """Crawl the site directory of www.kanguowai.com.

    Starting from the ``/site/`` listing, the spider follows links to the
    individual site pages (handled by :meth:`parse_item`) and the
    "next page" link of the listing.
    """

    name = 'abroad'
    allowed_domains = ['www.kanguowai.com']
    start_urls = ['https://www.kanguowai.com/site/']

    rules = (
        # Detail pages: links inside the listing block whose URL matches
        # "site".  A raw string keeps the regex free of invalid string escapes.
        Rule(
            LinkExtractor(
                allow=r'site\.*',
                restrict_xpaths='//dl[@class="picture_lie"]',
            ),
            callback='parse_item',
        ),
        # Pagination: the anchor titled "下一页" ("next page") in the pager.
        Rule(
            LinkExtractor(
                restrict_xpaths='//div[@class="page"]//a[@title="下一页"]',
            ),
        ),
    )

    def start_requests(self):
        """Yield the seed requests with a browser-like User-Agent header.

        NOTE(review): the header is attached only to the requests created
        here; requests generated by ``rules`` are built by ``CrawlSpider``
        and presumably use the project's ``USER_AGENT`` setting -- confirm.
        """
        for url in self.start_urls:
            yield scrapy.Request(url, headers={"User-Agent": USER_AGENT})

    def parse_item(self, response):
        """Extract one ``AbroadwebsiteItem`` from a site detail page.

        Every field is the first match of its XPath, or ``None`` when the
        XPath matches nothing.
        """
        item = AbroadwebsiteItem()
        item['website_name'] = response.xpath(
            '//ul[@class="baseinfo"]/li[1]/h1/text()').extract_first()
        item["country"] = response.xpath(
            '//li[@class="linfo"]/a/text()').extract_first()
        item["url"] = response.xpath(
            '//li[@class="linfo siteurl"]/a/text()').extract_first()
        item["form"] = response.xpath(
            '//li[@class="rinfo"]/a/text()').extract_first()
        item["introduction"] = response.xpath(
            '//div[@class="sitetext"]/p/text()').extract_first()
        item["img_path"] = response.xpath(
            '//div[@class="sitepic"]/img/@src').extract_first()
        yield item

items

# -*- coding: utf-8 -*-

# Define here the models for your scraped items

# See documentation in:

# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class AbroadwebsiteItem(scrapy.Item):
    """Item holding the data scraped for one listed website.

    All six fields are plain ``scrapy.Field()`` declarations without
    metadata; the spider fills each with the first XPath match (or ``None``).
    """

    # Text fields taken from the detail page.
    website_name = scrapy.Field()
    country = scrapy.Field()
    url = scrapy.Field()
    form = scrapy.Field()
    introduction = scrapy.Field()

    # Value of the preview image's ``src`` attribute.
    img_path = scrapy.Field()

settings只上一点有用的

# Pipelines keyed by dotted path; the numbers are their order values
# (the image pipeline uses 300, the MySQL pipeline 301).
ITEM_PIPELINES = dict([
    ('abroadwebsite.pipelines.ImagesPipeline', 300),
    ('abroadwebsite.pipelines.AbroadwebsitePipeline', 301),
])

# MySQL connection parameters.
# NOTE(review): credentials are hard-coded here -- consider loading them
# from the environment instead.
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'spider'
MYSQL_ROOT = 'root'
MYSQL_PASSWORD = '123'

# SQL statements used by the MySQL pipeline.
USE = 'use spider'
TABLE = 'abroadwebsites'
DROP = "drop table if exists {}".format(TABLE)
CREATE = (
    'create table {}('
    'website_name varchar(255) NOT NULL,'
    'country varchar(255),'
    'url varchar(255),'
    'form varchar(255),'
    'introduction varchar(255),'
    'img_path varchar(255)'
    ')'
).format(TABLE)
# The %s markers are left in place as placeholders for the DB driver.
SAVEIN = (
    'insert into {} (website_name,country,url,form,introduction,img_path) '
    'values(%s,%s,%s,%s,%s,%s)'
).format(TABLE)

# Image storage directory, and the same path with a trailing slash that is
# used as a prefix when recording file locations.
IMAGES_STORE = 'D:/pics1'
Root_path = IMAGES_STORE + '/'

pipelines比较复杂包括了保存图片的方法

# -*- coding: utf-8 -*-

# Define your item pipelines here

# Don't forget to add your pipeline to the ITEM_PIPELINES setting

# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from abroadwebsite.settings import *

import pymysql

import logging

from scrapy import Request

from scrapy.exceptions import DropItem

from scrapy.pipelines.images import ImagesPipeline

class ImagesPipeline(ImagesPipeline):
    """Download each item's preview image and drop items without one.

    The class keeps the name ``ImagesPipeline`` (shadowing the imported
    Scrapy base class after this definition) because ``ITEM_PIPELINES``
    refers to it as ``abroadwebsite.pipelines.ImagesPipeline``.
    """

    def get_media_requests(self, item, info):
        """Yield the download request for the item's image, if it has one.

        ``img_path`` is resolved against the site root with ``urljoin`` so
        that root-relative, relative and absolute ``src`` values all produce
        a valid URL.  When ``img_path`` is missing or empty nothing is
        yielded, and ``item_completed`` then drops the item.
        """
        from urllib.parse import urljoin

        img_path = item.get("img_path")
        if not img_path:
            return
        yield Request(urljoin('https://www.kanguowai.com', img_path))

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage file name: the last path segment of the URL.

        ``item`` is accepted (and unused) because newer Scrapy releases pass
        it as a keyword argument.
        """
        return request.url.split('/')[-1]

    def item_completed(self, results, item, info):
        """Pass the item on if at least one image was downloaded.

        Raises:
            DropItem: when no download in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Image Downloaded Failed")
        return item

class AbroadwebsitePipeline(object):
    """Store scraped items in a MySQL table that is recreated on start-up."""

    def __init__(self):
        """Open the MySQL connection and (re)create the target table.

        Keyword arguments are used for ``pymysql.connect`` because recent
        PyMySQL releases no longer accept positional connection parameters.
        """
        self.connect = pymysql.connect(
            host=MYSQL_HOST,
            user=MYSQL_ROOT,
            password=MYSQL_PASSWORD,
            database=MYSQL_DATABASE,
        )
        self.cursor = self.connect.cursor()
        self.cursor.execute(USE)  # select the working database
        # Start from an empty table on every run.
        self.cursor.execute(DROP)
        self.cursor.execute(CREATE)

    def process_item(self, item, spider):
        """Insert one item as a row and return the item unchanged.

        The stored image path is ``Root_path`` plus the last segment of
        ``img_path``.  Failures are logged and the item is still returned,
        so one bad row does not stop later pipeline stages.
        """
        try:
            self.cursor.execute(
                SAVEIN,
                (
                    item["website_name"],
                    item["country"],
                    item["url"],
                    item["form"],
                    item["introduction"],
                    Root_path + item["img_path"].split('/')[-1],
                ),
            )
            self.connect.commit()
        except Exception:
            # Best-effort persistence: record the traceback, discard the
            # failed statement, and keep the crawl going.
            logging.exception("Failed to save item to MySQL")
            self.connect.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider closes."""
        self.cursor.close()
        self.connect.close()

这四部分代码实现了爬取数据——保存图片——存入数据库这三个目的。

2021-07-22

推荐阅读更多精彩内容