When requesting the shop detail pages for each list page, 15 child threads are started (each list page contains 15 shops).
This version does not yet use join():
There is a problem: the main thread does not wait for the 15 detail-page child threads to finish (it is non-blocking), so it keeps turning to the next list page and spawning more child threads to parse detail pages. On a weak machine, with a poor network or heavy anti-scraping on the site, there can easily be over a thousand child threads running at once, which puts real pressure on the server. It also makes the saved data much messier: the first row might be the first shop on list page 1, while the second row is the 15th shop on list page 50. This needs to be optimized. (A small standalone sketch of the problem follows below.)
Roughly 40.5 records per minute, which works out to 58,320 records per day (24 hours). That still seems a bit slow, and it amounts to about 75 child threads running at a time.
The version further down, with join() added, is even slower.
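To make the problem concrete, here is a minimal, self-contained sketch (not part of the scraper; the page count, shop count, and sleep time are made up) showing how the number of live threads balloons when the main loop never waits for its children:
import threading
import time

def parse_detail(page, shop):
    # Stand-in for a slow detail-page request (network + anti-bot delays).
    time.sleep(5)

for page in range(1, 51):              # the main thread keeps turning list pages
    for shop in range(15):             # 15 detail threads per list page
        threading.Thread(target=parse_detail, args=(page, shop)).start()
    # No join() here, so threads from many pages pile up at the same time.
    print('page', page, 'live threads:', threading.active_count())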
#coding:utf-8
import hashlib
import time
from fake_useragent import UserAgent
import requests
# from UA import data
import json
import scylla_test
import kuai_test
from lxml import etree
import re
import csv
from shantou_links import *
import random
from cookie_parse import GetComments
import threading
L = threading.Lock()
class Luoyang:
def __init__(self):
self.city_name = '洛阳'
# Entry point of the crawler; start crawling
def get_first_page(self):
ua = UserAgent()
print('全部爬虫开始工作,从后往前')
data = ['id从后往前','店铺名称','所属城市','行政区','地址','二级分类','三级分类','电话','点评数量','最新点评时间']
with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(data)
field = '买车卖车'
urls = self.get_target_urls()
self.start(urls)
print('完成')
# Get all of the seed links
def get_target_urls(self):
# beau_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g34072')
# mend_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g176')
match_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g177')
# result = mend_urls + beau_urls + match_urls
result = match_urls
print(result)
return result
def get_target_url(self, url):
field = '买车'
while True:
headers = {
}
content = self.url_start(url,headers,field)
if content.status_code == 200 and field in content.text:
tree = etree.HTML(content.text)
urls = tree.xpath('//div[@id="region-nav"]/a/@href')
return urls
# Iterate over all seed links, send requests, save the data
def start(self,urls):
for first_url in urls:
field = '买车卖车'
# If the response is not valid, retry until it is
while True:
headers = {
}
response = self.url_start(first_url,headers,field)
print(response.status_code)
print(response.text)
if response.status_code == 200 and field in response.text:
print('返回200,页面有需求的字段')
break
else:
print('该请求失败,准备重试')
time.sleep(2)
if 'g34084' in first_url:
type = '4s'
elif 'g34085' in first_url:
type = '综合经销商'
elif 'g34086' in first_url:
type = '二手车'
else:
type = 0
self.parse_firs_full_page(response, type)
# Get the page count; save every shop on the first page, then loop through the remaining pages and save them; if there is no type, pass type=0
def parse_firs_full_page(self,response,type=0):
tree = etree.HTML(response.text)
try:
pages = tree.xpath('//a[@class="PageLink"]/text()')[-1]
pages = int(pages)
except:
pages = 0
# Save the first page's data
# url_t = 'http://www.dianping.com/luoyang/ch65/g34085'
# if url_t not in response.url:
# print('经销商页面进来了,首页不保存')
self.one_full_page(response,type)
# Turn the page and keep saving
self.turn_page(response.url,pages,type)
# Turn pages and save every shop on each page
def turn_page(self, url, pages, type):
if pages > 0:
for i in range(2, pages + 1):
time.sleep(5)
start_url = url + 'p{}'
action_url = start_url.format(i)
if i == 2:
headers = {
'Referer': url,
'Host': 'www.dianping.com'
}
else:
headers = {
'Referer': start_url.format(i - 1),
'Host': 'www.dianping.com'
}
print(headers)
field = '买车卖车'
while True:
response = self.url_start(action_url, headers, field)
print(response.status_code)
# print(response.text)
if response.status_code == 200 and field in response.text:
print('该链接请求成功')
break
else:
print('请求失败')
time.sleep(2)
self.one_full_page(response, type)
# Send a new request and return the response
def url_start(self,url,headers,field):
while True:
try:
# Catch proxy timeout exceptions
times = int(time.time())
planText = "orderno=隐藏,secret=b5dd53126b3143fba00dda5fec6b9607,timestamp={}".format(times)
md = hashlib.md5()
md.update(planText.encode('utf-8'))
content = md.hexdigest()
ua = UserAgent()
headers['User-Agent'] = ua.random
headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186170227TPgMj4&timestamp={}'.format(content.upper(), times)
proxies = {'http': 'forward.xdaili.cn:80'}
response = requests.get(url, proxies=proxies, headers=headers)
return response
except:
print ('代理超时,重试.....')
# Download and save the details of every shop on a single list page
def one_full_page(self,response,type=0):
tree = etree.HTML(response.text)
business_li = tree.xpath('//div[@class="pic"]/a/@href')
headers = {
'Referer': response.url,
'Host': 'www.dianping.com'
}
print(headers)
if len(business_li) > 0:
for business in business_li:
id = re.findall(r'/shop/(\d+)', business)[0]
t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
t.start()
else:
print('该页面没有店铺')
# Parse a shop detail page and save the data; used by one_full_page
def parse_detail(self,url,id,headers,type=0):
field = '地址'
while True:
response = self.url_start(url,headers,field)
print(headers)
if response.status_code == 200 and len(response.text)>0 and field in response.text:
print('请求成功200')
break
else:
print('请求失败,重试')
time.sleep(2)
content = response.text
# print(content)
try:
tree = etree.HTML(content)
# print('详情页数据',content)
name = tree.xpath('//h1[@class="shop-name"]/text()')[0]
city = self.city_name
district_list = tree.xpath('//div[@class="breadcrumb"]/a/text()')
district = ''
for i in district_list:
if '区' in i:
district = i
address = tree.xpath('//span[@itemprop="street-address"]/text()')[0].strip()
second_type =tree.xpath('//div[@class="breadcrumb"]/a/text()')[1]
if type == 0:
third_type = ''
else:
third_type = type
try:
tel = tree.xpath('//p[@class="expand-info tel"]/span[@itemprop="tel"]/text()')[0]
except:
tel = ''
comment_num = tree.xpath('//span[@id="reviewCount"]/text()')[0]
id =id
latest_time = self.get_commets_time(id)
info_list = [id,name,city,district,address,second_type,third_type,tel,comment_num,latest_time]
# Save a single shop's details
L.acquire()
with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(info_list)
L.release()
print('单个店铺详情保存成功')
except:
print('详情页没有数据')
# Get the date of the latest review; used by parse_detail
def get_commets_time(self,id):
getcomments = GetComments(id)
lasttime = getcomments.get_lasttime()
return lasttime
if __name__ == '__main__':
luoyang = Luoyang()
luoyang.get_first_page()
Adding join() to optimize the multithreading:
This makes the downloaded data much more orderly and puts far less pressure on the server.
The links commented out in the code are the ones that have already been crawled; the code only runs the remaining part.
Only this part of the code was changed:
threads = []
for business in business_li:
id = re.findall(r'/shop/(\d+)', business)[0]
t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
threads.append(t)
t.start()
for thread in threads:
thread.join()
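join() keeps the data orderly, but it also means each list page waits for its slowest detail request before moving on. A middle ground, sketched below with the standard library's ThreadPoolExecutor (this is not used in the original code; the URLs, numbers, and parse_detail stand-in are made up), caps how many detail requests run at once while still letting pages overlap:
from concurrent.futures import ThreadPoolExecutor
import time

def parse_detail(url, shop_id):
    # Stand-in for the real detail-page request, parse, and CSV write.
    time.sleep(1)
    return shop_id

# One shared pool for the whole crawl; max_workers caps how many detail
# requests can be in flight at any moment (15 = one list page's worth).
with ThreadPoolExecutor(max_workers=15) as pool:
    for page in range(1, 4):                       # pretend list pages
        urls = ['http://www.dianping.com/shop/{}{:02d}'.format(page, i) for i in range(15)]
        futures = [pool.submit(parse_detail, u, u.rsplit('/', 1)[-1]) for u in urls]
        # Waiting here keeps each page's results together, like join() does;
        # drop this loop and pages overlap for more throughput.
        for f in futures:
            print('saved shop', f.result())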
Here is the complete code:
#coding:utf-8
import hashlib
import time
from fake_useragent import UserAgent
import requests
# from UA import data
import json
import scylla_test
import kuai_test
from lxml import etree
import re
import csv
from shantou_links import *
import random
from cookie_parse import GetComments
import threading
L = threading.Lock()
class Luoyang:
def __init__(self):
self.city_name = '洛阳'
# Entry point of the crawler; start crawling
def get_first_page(self):
ua = UserAgent()
print('全部爬虫开始工作,从后往前')
data = ['id从配件厂老城区开始','店铺名称','所属城市','行政区','地址','二级分类','三级分类','电话','点评数量','最新点评时间']
with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(data)
field = '买车卖车'
urls = self.get_target_urls()
self.start(urls)
print('完成')
# Get all of the seed links
def get_target_urls(self):
# beau_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g34072')
# mend_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g176')
match_urls = self.get_target_url(url='http://www.dianping.com/luoyang/ch65/g177')
# result = mend_urls + beau_urls + match_urls
result = match_urls
print(result)
return result
def get_target_url(self, url):
field = '买车'
while True:
headers = {
}
content = self.url_start(url,headers,field)
if content.status_code == 200 and field in content.text:
tree = etree.HTML(content.text)
urls = tree.xpath('//div[@id="region-nav"]/a/@href')
return urls
# Iterate over all seed links, send requests, save the data
def start(self,urls):
for first_url in urls[2:]:
field = '买车卖车'
# If the response is not valid, retry until it is
while True:
headers = {
}
response = self.url_start(first_url,headers,field)
print(response.status_code)
print(response.text)
if response.status_code == 200 and field in response.text:
print('返回200,页面有需求的字段')
break
else:
print('该请求失败,准备重试')
time.sleep(2)
if 'g34084' in first_url:
type = '4s'
elif 'g34085' in first_url:
type = '综合经销商'
elif 'g34086' in first_url:
type = '二手车'
else:
type = 0
self.parse_firs_full_page(response, type)
# Get the page count; save every shop on the first page, then loop through the remaining pages and save them; if there is no type, pass type=0
def parse_firs_full_page(self,response,type=0):
tree = etree.HTML(response.text)
try:
pages = tree.xpath('//a[@class="PageLink"]/text()')[-1]
pages = int(pages)
except:
pages = 0
# Save the first page's data
# url_t = 'http://www.dianping.com/luoyang/ch65/g34085'
# if url_t not in response.url:
# print('经销商页面进来了,首页不保存')
self.one_full_page(response,type)
# Turn the page and keep saving
self.turn_page(response.url,pages,type)
# Turn pages and save every shop on each page
def turn_page(self, url, pages, type):
if pages > 0:
for i in range(2, pages + 1):
time.sleep(5)
start_url = url + 'p{}'
action_url = start_url.format(i)
if i == 2:
headers = {
'Referer': url,
'Host': 'www.dianping.com'
}
else:
headers = {
'Referer': start_url.format(i - 1),
'Host': 'www.dianping.com'
}
print(headers)
field = '买车卖车'
while True:
response = self.url_start(action_url, headers, field)
print(response.status_code)
# print(response.text)
if response.status_code == 200 and field in response.text:
print('该链接请求成功')
break
else:
print('请求失败')
time.sleep(2)
self.one_full_page(response, type)
# Send a new request and return the response
def url_start(self,url,headers,field):
while True:
try:
# Catch proxy timeout exceptions
times = int(time.time())
planText = "orderno=ZF20186170227TPgMj4,secret=b5dd53126b3143fba00dda5fec6b9607,timestamp={}".format(times)
md = hashlib.md5()
md.update(planText.encode('utf-8'))
content = md.hexdigest()
ua = UserAgent()
headers['User-Agent'] = ua.random
headers['Proxy-Authorization'] = 'sign={}&orderno=ZF20186170227TPgMj4&timestamp={}'.format(content.upper(), times)
proxies = {'http': 'forward.xdaili.cn:80'}
response = requests.get(url, proxies=proxies, headers=headers)
return response
except:
print ('代理超时,重试.....')
# Download and save the details of every shop on a single list page
def one_full_page(self,response,type=0):
tree = etree.HTML(response.text)
business_li = tree.xpath('//div[@class="pic"]/a/@href')
headers = {
'Referer': response.url,
'Host': 'www.dianping.com'
}
print(headers)
if len(business_li) > 0:
threads = []
for business in business_li:
id = re.findall(r'/shop/(\d+)', business)[0]
t = threading.Thread(target=self.parse_detail,args=(business,id,headers,type))
threads.append(t)
t.start()
for thread in threads:
thread.join()
else:
print('该页面没有店铺')
# Parse a shop detail page and save the data; used by one_full_page
def parse_detail(self,url,id,headers,type=0):
field = '地址'
while True:
response = self.url_start(url,headers,field)
print(headers)
if response.status_code == 200 and len(response.text)>0 and field in response.text:
print('请求成功200')
break
else:
print('请求失败,重试')
time.sleep(2)
content = response.text
# print(content)
try:
tree = etree.HTML(content)
# print('详情页数据',content)
name = tree.xpath('//h1[@class="shop-name"]/text()')[0]
city = self.city_name
district_list = tree.xpath('//div[@class="breadcrumb"]/a/text()')
district = ''
for i in district_list:
if '区' in i:
district = i
address = tree.xpath('//span[@itemprop="street-address"]/text()')[0].strip()
second_type =tree.xpath('//div[@class="breadcrumb"]/a/text()')[1]
if type == 0:
third_type = ''
else:
third_type = type
try:
tel = tree.xpath('//p[@class="expand-info tel"]/span[@itemprop="tel"]/text()')[0]
except:
tel = ''
comment_num = tree.xpath('//span[@id="reviewCount"]/text()')[0]
id =id
latest_time = self.get_commets_time(id)
info_list = [id,name,city,district,address,second_type,third_type,tel,comment_num,latest_time]
# Save a single shop's details
L.acquire()
with open('luoyang_dianping_car.csv', 'a', newline='', encoding='utf-8') as f:
writer = csv.writer(f)
writer.writerow(info_list)
L.release()
print('单个店铺详情保存成功')
except:
print('详情页没有数据')
# Get the date of the latest review; used by parse_detail
def get_commets_time(self,id):
getcomments = GetComments(id)
lasttime = getcomments.get_lasttime()
return lasttime
if __name__ == '__main__':
luoyang = Luoyang()
luoyang.get_first_page()
About 8.4 records per minute, or 12,096 records per day (24 hours); that really is painfully slow, and this is with 15 child threads per list page.
One thing to note:
When child threads modify or assign to a shared global variable, you must hold a lock. Assignment and modification run asynchronously too: the first child thread may have only just computed the new value (the assignment has not happened yet) when a second thread comes in and modifies the variable, so the value the global ends up with will not match what you expect.
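A minimal sketch of that race, using a toy counter rather than the scraper's CSV file, and how the lock prevents it:
import threading

counter = 0
lock = threading.Lock()

def add_many():
    global counter
    for _ in range(100000):
        # counter += 1 is a read-modify-write, not an atomic operation;
        # without the lock, two threads can read the same old value and
        # one of the two increments is lost, exactly the race described above.
        with lock:
            counter += 1

threads = [threading.Thread(target=add_many) for _ in range(15)]
for t in threads:
    t.start()
for t in threads:
    t.join()
print(counter)    # always 1500000 with the lock; often lower without it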