In interface testing, many of the pages being exercised require authorization. The authorization fields are usually carried in the request header, in the form of a cookie, a token, and so on, so the common pattern is to first send a request to one URL to obtain the credential, and then use that credential to build the header for every subsequent request. The example below shows this kind of header construction applied to a web spider.
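As a minimal sketch of that flow using requests (the login URL, payload fields, the 'token' key and the Authorization scheme are placeholders invented for illustration, not the interfaces of the site used later):

import requests

# Hypothetical login interface: the URL, the payload fields and the 'token'
# key in the JSON response are assumptions made up for this sketch.
login_resp = requests.post('http://example.com/api/login',
                           data={'user': 'tester', 'password': '123456'})
token = login_resp.json()['token']  # the credential returned by the first request

# Reuse the credential to build the header for the later interface requests.
headers = {'Authorization': 'Bearer ' + token}
resp = requests.get('http://example.com/api/orders', headers=headers)
print(resp.status_code)

The spider below does the same thing with a cookie instead of a token.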
# coding=utf-8
# http://699pic.com/download/getDownloadUrl----interface that returns the download URL; it takes a pid, e.g. 500472407
# The response contains the image link, e.g. URL=http://down.699pic.com/photo/50047/2407.jpg?_upt=63305cd11514965673&_upd=500472407.jpg
# The pid is split into two chunks of the path; _upt is generated on the fly and is only valid for a limited time; _upd is the picture id
# How do we get _upd?
# http://699pic.com/sousuo-61847-0-1-0-0-0.html  page 1  (how the keyword is turned into 61847 can be found in the page source)
# http://699pic.com/sousuo-61847-0-2-0-0-0.html  page 2
# http://699pic.com/sousuo-61847-0-3-0-0-0.html  page 3
# To fetch more pages only this page number changes, and the total page count can be found in the HTML
# _upd can be found in the HTML of the page elements and simply concatenated into the download URL
# So the problem becomes: how is the keyword-to-five-digit-number mapping generated?
import requests
import time
import multiprocessing  # multi-process downloading
from bs4 import BeautifulSoup  # parses the HTML into a tree, which makes searching and splitting easy
import sys
import io
from urllib import request  # used to send requests that carry the login cookie
import json
import os
import random

sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8')  # change the default encoding of stdout
class SpiderForPicture(object):
    author = 'Blokks'

    def __init__(self, keyword):
        self.keyword = keyword

    def saving_folder_making(self):
        # create a folder named after the keyword to hold the downloaded pictures
        folder_path = 'F:\\test_auto\\spider\\pictures\\' + self.keyword
        if not os.path.exists(folder_path):
            os.mkdir(folder_path)
            print('Created a directory named %s to store the pictures' % self.keyword)
        return folder_path
    def get_page_count(self):
        try:
            keyword = self.keyword
            url = 'http://699pic.com/tupian/' + keyword + '.html'
            html = requests.get(url)
            content = html.content.decode('utf-8')
            re_1 = BeautifulSoup(content, "lxml")
            re_2 = re_1.find_all(name='div', attrs={'class': 'pager-linkPage'})
            re_3 = re_2[0].find_all(name='a')
            list_ = []
            list_result = []
            result_dict = {}
            for item in re_3:
                ls = item.get('href').split('-')
                list_.append(ls)
                list_result.append(int(ls[3]))
            page_count = str(max(list_result))
            key_number = str(list_[0][1])
            result_dict[key_number] = page_count
            return result_dict  # obtain the keyword's numeric id and the page count
        except Exception:
            print('No pictures found for this search keyword...')
            exit(1)
    def get_pic_id(self):
        # walk every result page and collect the data-id (picture id) of each item
        pic_id_list = []
        kw_dict = self.get_page_count()
        list_ = []
        for i in kw_dict:
            list_.append(i)
            list_.append(kw_dict[i])
        page_count = list_[1]
        print('Keyword %s returned %s pages of results' % (self.keyword, page_count))
        key_number = list_[0]
        for num in range(1, int(page_count) + 1):
            url = 'http://699pic.com/sousuo-' + key_number + '-0-' + str(num) + '-0-0-0.html'
            html = requests.get(url)
            content = html.content.decode('utf-8')
            re_1 = BeautifulSoup(content, "lxml")
            re_2 = re_1.find_all(name='div', attrs={'class': 'list'})
            for item in re_2:
                pic_id_list.append(item.get('data-id'))
        # keep only real picture ids (at least 9 digits); filtering into a new list avoids the
        # skipped-element bug of removing items from a list while iterating over it
        pic_id_list = [i for i in pic_id_list if i and len(str(i)) >= 9]
        return pic_id_list
    def get_download_url(self):
        # ask the download interface for a signed download URL for every picture id;
        # the interface needs a logged-in session, so the cookie is sent by hand
        pic_id_list = self.get_pic_id()
        url_pool = []
        for pic_id in pic_id_list:
            url = 'http://699pic.com/download/getDownloadUrl?pid=' + pic_id
            # long cookie string for a logged-in session (continued over several lines; it expires after a while)
            cookie_str = r'2017endalert=1; uniqid=5a4c7bd11a363; bargain_popup=1; uv_cookie=c610bdc8d6965b2e7abec5d93' \
r'd07ad59; is_click_activity=1; from_data=YTo1OntzOjQ6Imhvc3QiO3M6MTA6IjY5OXBpYy5jb20iO3M6Mzoi' \
r'c2VtIjtiOjA7czoxMDoic291cmNlZnJvbSI7aTowO3M6NDoid29yZCI7TjtzOjM6ImtpZCI7aTowO30%3D; isVip=0; ' \
r'isPay=0; is_qy_vip=1; is_join_2017_end_18454014=0; isSearch=0; s_token=03e987b8c9b7912d89e77b' \
r'b7fd9b62e8; PHPSESSID=kt1v9k8sid51kg0ej6e127cvkvgmpc7q; Qs_lvt_135734=1513923395%2C1513923542' \
r'%2C1514961873%2C1515026629%2C1515031146; mediav=%7B%22eid%22%3A%22278616%22%2C%22ep%22%3A' \
r'%22%22%2C%22vid%22%3A%22%5EySs)9Ku%25D%3A*qX%24(Pe%3FD%22%2C%22ctn%22%3A%22%22%7D; ' \
r'Hm_lvt_1154154465e0978ab181e2fd9a9b9057=1515026630,1515026702,1515031028,1515031147; ' \
r'Hm_lvt_ddcd8445645e86f06e172516cac60b6a=1515026629,1515026702,1515031028,1515031147; ' \
r'recentlysearch=YTo0OntpOjA7YToyOntzOjI6Imt3IjtzOjc6ImRpYW5uYW8iO3M6NjoicGlueWluIjtzOjY6IjMx' \
r'MTExMCI7fWk6MTthOjI6e3M6Mjoia3ciO3M6Njoi55S16ISRIjtzOjY6InBpbnlpbiI7czo3OiJkaWFubmFvIjt9aTo' \
r'yO2E6Mjp7czoyOiJrdyI7czoxMjoi5pm66IO95a625bGFIjtzOjY6InBpbnlpbiI7czoxMjoiemhpbmVuZ2ppYWp1Ij' \
r't9aTozO2E6Mjp7czoyOiJrdyI7czo2OiLlpKfmtbciO3M6NjoicGlueWluIjtzOjU6ImRhaGFpIjt9fQ%3D%3D; ' \
r'search_Kw=%22diannao%22; is_join_2017_end_533435=0; Qs_pv_135734=144824772440290620%2C38906' \
r'64247893633500%2C3737559667568741000%2C2243149228815513300%2C1985644855545767200; ' \
r'Hm_lpvt_1154154465e0978ab181e2fd9a9b9057=1515034556; Hm_lpvt_ddcd8445645e86f06e172516cac60' \
r'b6a=1515034556; redirect=http%3A%2F%2F699pic.com%2Ftupian-500472175.html; session_data=YTo1' \
r'OntzOjM6InVpZCI7czo2OiI1MzM0MzUiO3M6NToidG9rZW4iO3M6MzI6ImZkZDIyZWY5NDJlMjY3NjViYTdhMGE2NmY' \
r'4NzVmMTE3IjtzOjM6InV1dCI7czozMjoiMWM0Y2E4ZDZmMDRhYTdhYmJiNTNkNTkwZmI4MGJiMWMiO3M6NDoiZGF0YS' \
r'I7YToxOntzOjg6InVzZXJuYW1lIjtzOjEyOiLku5nlpbPlprnlprkiO31zOjY6ImV4dGltZSI7aToxNTE1NjM5MzgzO' \
r'30%3D; uid=533435; username=%E4%BB%99%E5%A5%B3%E5%A6%B9%E5%A6%B9; head_pic=http%3A%2F%2' \
r'Fq.qlogo.cn%2Fqqapp%2F101268598%2FD2C2DF0668D1C9B957ADD345B9B7A420%2F40; login_user=1'
            req = request.Request(url)
            req.add_header('Cookie', cookie_str)
            req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36')
            resp = request.urlopen(req)
            result_ = resp.read().decode('utf-8')
            result_dict = json.loads(result_)
            if 'url' not in result_dict:
                print('The cookie no longer works o(╥﹏╥)o')
                exit(1)
            download_url = result_dict['url']
            url_pool.append(download_url)
        return url_pool
    def download_picture(self, url):
        # download one picture and save it under the keyword folder with a random file name
        file_name = self.keyword + str(random.randint(100000, 999999)) + '.jpg'
        folder_path = self.saving_folder_making()
        file_path = folder_path + '\\' + file_name
        resp = requests.get(url)
        content = resp.content
        with open(file_path, 'wb') as f:
            f.write(content)
def main():
    start_time = time.time()
    keyword = input('Enter the keyword to search for (in pinyin): ')
    spider = SpiderForPicture(keyword)
    url_pool = spider.get_download_url()
    middle_time = time.time()
    time_cost = middle_time - start_time
    print('Download URL parsing finished---took %s---starting download....' % time_cost)
    # four worker processes download the pictures in parallel
    p = multiprocessing.Pool(processes=4)
    p.map(spider.download_picture, url_pool)
    p.close()
    p.join()
    end_time = time.time()
    time_used = end_time - start_time
    print('All downloads finished, total time %s' % time_used)


if __name__ == '__main__':
    main()
As you can see, the cookie in the code above is extremely long. In this example, add_header is used to build the request header so that the later requests can be made with the required permissions.
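For reference, the same interface call could also be made with requests instead of urllib. This is only a rough equivalent of the getDownloadUrl request above, assuming cookie_str and pic_id hold the same values used in the script and that the cookie is still valid:

import requests

headers = {
    'Cookie': cookie_str,  # the same long cookie string, presumably captured from a logged-in session
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
}
resp = requests.get('http://699pic.com/download/getDownloadUrl',
                    params={'pid': pic_id}, headers=headers)
result_dict = resp.json()
if 'url' not in result_dict:
    print('Cookie has expired, log in again and capture a fresh one')

Either way, the idea is the same: obtain the credential once, then reuse it in the header of every later request.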