爬取黄页88网的所有企业信息:http://b2b.huangye88.com/region/
首先得安装 scrapy 和 pymongo。
安装和创建爬虫项目的步骤比较简单,这里快速过一下:
pip install scrapy
pip install pymongo
scrapy startproject sellsystem
在spiders目录下创建我们的爬虫文件
import copy
import scrapy
from ..items import SellItem
# 先下一页
class indexSpider(scrapy.Spider):
    """Scrape company records from b2b.huangye88.com, starting at the region index.

    Callback chain (each step follows the links it extracts):

    1. ``parse``              -- region index (``start_urls``).
    2. ``parse_qu``           -- area page, e.g. http://b2b.huangye88.com/anyang/.
    3. ``parse_instury_list`` -- sub-area page; records province/city/district
       and follows every industry link.
    4. ``parse_instury``      -- industry listing; follows every company and
       the next listing page.
    5. ``parse_content``      -- company "contact" page.
    6. ``parse_content2``     -- company "introduction" page; yields the item.

    The item travels between callbacks in ``Request.meta``. Every request gets
    its own ``copy.deepcopy`` of the item because many requests are in flight
    at once and would otherwise all mutate the same object.
    """

    name = 'sell_finally'
    all_province = []
    start_urls = [
        'http://b2b.huangye88.com/region/'
    ]
    page = 1

    # XPath selecting the labelled rows on both company detail pages.
    _ROWS_XPATH = '//ul[@class="con-txt"]/li'

    # Row label -> (item field, XPaths tried in order, relative to the <li>).
    _CONTACT_FIELDS = {
        '联系人:': ('link_people', ('./text()', './a/text()')),  # contact person
        '公司名称:': ('company_name', ('./text()',)),  # company name
        '地址:': ('compay_place', ('./text()',)),  # address
        '电话:': ('phone', ('./text()',)),  # phone
        '手机:': ('phone2', ('./text()',)),  # mobile
        '公司主页:': ('website', ('./a/text()',)),  # homepage
    }
    _ABOUT_FIELDS = {
        '成立时间:': ('establish_time', ('./text()',)),  # founding date
        '员工人数:': ('company_people_num', ('./text()',)),  # head count
        '主营产品:': ('product', ('./text()',)),  # main products
        '主营行业:': ('industry', ('./a/text()',)),  # main industry
        '企业法人:': ('faren', ('./text()',)),  # legal representative
    }

    def parse(self, response):
        """Entry point: follow every link in the ``dl#clist`` block."""
        for url in response.xpath('//dl[@id="clist"]/dd/a/@href').extract():
            self.logger.debug('area url: %s', url)
            yield scrapy.Request(url, callback=self.parse_qu)

    def parse_qu(self, response):
        """Follow every sub-area link listed under ``#subarealist``."""
        hrefs = response.xpath('//*[@id="subarealist"]/div[2]/a/@href').extract()
        for url in hrefs:
            self.logger.debug('sub-area url: %s', url)
            yield scrapy.Request(url, callback=self.parse_instury_list)

    def parse_instury_list(self, response):
        """Record the location of this page and follow each industry link.

        Missing breadcrumb nodes produce empty strings instead of raising
        ``IndexError``, so one odd page no longer aborts the whole branch.
        """
        item = SellItem()
        crumb = '//div[@class="subNav"]/a[%d]/text()'
        province = response.xpath(crumb % 2).extract_first(default='')
        city = response.xpath(crumb % 3).extract_first(default='')
        texts = response.xpath('/html/body/div[3]/div[1]/text()').extract()
        district = texts[2] if len(texts) > 2 else ''

        # The slices drop a fixed-length suffix from the page text
        # (presumably a constant caption -- confirm against the live page).
        item['privince'] = province[:-4]
        item['city'] = city[:-4]
        # Keep what follows the first '市' and drop the trailing 6 characters.
        item['district'] = district[district.find('市') + 1:-6]
        self.logger.debug('location: %s', item)

        for url in response.xpath('//div[@class="tag_tx"]/ul/li/a/@href').extract():
            yield scrapy.Request(url, callback=self.parse_instury,
                                 meta={'item': copy.deepcopy(item)},
                                 dont_filter=True)

    def parse_instury(self, response):
        """Follow every company on an industry listing, then the next page."""
        seitem = response.meta['item']
        self.logger.debug('industry listing: %s', response.url)

        # Each company link gets 'company_contact.html' appended to reach
        # its contact page.
        for href in response.xpath('//*[@id="jubao"]/dl/dt/h4/a/@href').extract():
            yield scrapy.Request(href + 'company_contact.html',
                                 callback=self.parse_content,
                                 meta={'item': copy.deepcopy(seitem)},
                                 dont_filter=True)

        # Pagination: the first <a> after a <span> in the paging bar.
        next_href = response.xpath(
            '//div[@class="page_tag Baidu_paging_indicator"]'
            '/span/following-sibling::a[1]/@href').extract_first()
        if next_href:
            self.logger.debug('next page: %s', next_href)
            yield scrapy.Request(next_href, callback=self.parse_instury,
                                 meta={'item': copy.deepcopy(seitem)},
                                 dont_filter=True)

    def parse_content(self, response):
        """Read the contact page, then continue to the introduction page.

        If the page has no link to the introduction page, the partially
        filled item is yielded so the contact data is not thrown away.
        """
        item = response.meta['item']
        item['page_url'] = response.url
        self._fill_labelled_fields(response, item, self._CONTACT_FIELDS)

        about_url = response.xpath('//ul[@class="meun"]/a[2]/@href').extract_first()
        if about_url:
            yield scrapy.Request(url=about_url, callback=self.parse_content2,
                                 meta={'item': copy.deepcopy(item)},
                                 dont_filter=True)
        else:
            self.logger.warning('no introduction link on %s', response.url)
            yield item

    def parse_content2(self, response):
        """Read the introduction page and yield the finished item."""
        item = response.meta['item']
        self._fill_labelled_fields(response, item, self._ABOUT_FIELDS)

        intro = response.xpath('//p[@class="txt"]/text()').extract_first()
        if intro is not None:
            item['introdocution'] = intro
        yield item

    @classmethod
    def _fill_labelled_fields(cls, response, item, field_map):
        """Copy the values of labelled ``<li>`` rows into *item*.

        Each row is matched by the text of its ``<label>``; rows whose label
        is not in *field_map*, or whose value node is missing, are skipped.
        Rows are queried relative to their own ``<li>`` selector, so the
        lookup stays correct when a page contains several ``ul.con-txt``.
        """
        for row in response.xpath(cls._ROWS_XPATH):
            label = row.xpath('./label/text()').extract_first()
            target = field_map.get(label)
            if target is None:
                continue
            field, value_paths = target
            for path in value_paths:
                value = row.xpath(path).extract_first()
                if value is not None:
                    item[field] = value
                    break
需要注意的是:我们在这里 yield 请求时通过 meta 传递的数据,Scrapy 默认只做浅复制,item 仍然是同一个对象;多个请求并发处理时会互相覆盖,造成数据错乱。用 copy.deepcopy() 做深度复制就可以了。
我们的item文件
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SellItem(scrapy.Item):
    """Fields collected for one company record."""

    # --- contact details ---
    link_people = scrapy.Field()         # contact person
    phone = scrapy.Field()               # phone
    phone2 = scrapy.Field()              # second phone (the spider stores the "手机:" row here)
    company_name = scrapy.Field()        # company name
    company_instury = scrapy.Field()     # main products (not populated by the spider shown in this file)
    compay_place = scrapy.Field()        # company address
    website = scrapy.Field()             # company homepage

    # --- location ---
    privince = scrapy.Field()            # province
    city = scrapy.Field()                # city
    district = scrapy.Field()            # district

    # --- company profile ---
    establish_time = scrapy.Field()      # founding date
    company_people_num = scrapy.Field()  # number of employees
    product = scrapy.Field()             # main products
    industry = scrapy.Field()            # industry
    faren = scrapy.Field()               # legal representative
    introdocution = scrapy.Field()       # company introduction

    # --- bookkeeping ---
    page_url = scrapy.Field()            # URL the record was scraped from
在 pipelines.py 中对采集到的数据进行处理,并保存到 MongoDB 里
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
from .items import SellItem
import pymongo
from scrapy.conf import settings
class SellsystemPipeline(object):
    """Item pipeline that stores every scraped item as one MongoDB document.

    Connection parameters come from the ``MONGODB_HOST``, ``MONGODB_PORT``,
    ``MONGODB_DBNAME`` and ``MONGODB_DOCNAME`` settings.
    """

    def __init__(self, crawler_settings=None):
        """Open the MongoDB connection.

        Args:
            crawler_settings: settings mapping to read the ``MONGODB_*`` keys
                from. When omitted, the module-level ``settings`` object is
                used, so ``SellsystemPipeline()`` keeps working as before.
        """
        cfg = settings if crawler_settings is None else crawler_settings
        # Keep the client so it can be closed when the spider finishes.
        self.client = pymongo.MongoClient(host=cfg['MONGODB_HOST'],
                                          port=cfg['MONGODB_PORT'])
        self.post = self.client[cfg['MONGODB_DBNAME']][cfg['MONGODB_DOCNAME']]

    @classmethod
    def from_crawler(cls, crawler):
        """Build the pipeline from the running crawler's settings."""
        return cls(crawler.settings)

    def close_spider(self, spider):
        """Release the MongoDB connection once the spider is done."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert *item* into the collection and pass it on unchanged.

        A plain ``dict`` copy is inserted, so the ``_id`` that PyMongo adds
        to the document does not leak into the item itself.
        ``insert_one`` replaces ``Collection.insert``, which is deprecated
        in PyMongo 3 and removed in PyMongo 4.
        """
        self.post.insert_one(dict(item))
        return item
在 settings.py 文件中设置 MongoDB 的参数
# MongoDB connection parameters read by SellsystemPipeline.
# NOTE: the pipeline only runs if it is also listed in ITEM_PIPELINES
# (that entry is not shown in this snippet).
MONGODB_HOST = '127.0.0.1'  # MongoDB server address
MONGODB_PORT = 27017  # MongoDB server port
MONGODB_DBNAME = 'sell'  # database name
MONGODB_DOCNAME = 'company'  # collection the items are inserted into
在项目根目录下创建一个main.py文件
from scrapy import cmdline
# Run the spider the same way the shell command would, so the crawl can be
# started (and debugged) from an IDE by executing this file.
crawl_argv = 'scrapy crawl sell_finally'.split()
cmdline.execute(crawl_argv)
最后运行我们的main.py文件
大概20分钟有10w多条数据,这个看个人网速