最近用Python写了一个新浪微博相册的批量下载程序,选好用户后运行就可以把他相册的全部相片下载到本地。记录了下载历史,可以中途停止和续传。
浏览新浪微博相册需要登录。我先尝试了模拟登录,但在登录机制和验证码上纠结了很久,最后选择直接用cookie登录,这样其实更省事。
运行环境是Python 2.7,只用到了requests这一个第三方库,它的API用起来更舒服。安装方法:
pip install requests
使用的时候:
1.先打开该用户的微博页面,F12或者查看源代码,找到他的page_id,填到程序的uid处。
2.用F12或者其他监听软件找到cookies,填入程序cookies处。
3.把希望保存的本地目录路径填入程序中dirpath处。
下面是完整代码
# coding=u8
#作者:平仄_pingze (简书)
"功能"
'''
获取新浪微博用户相册照片到本地
'''
"使用方法"
'''
1.填写储存目录
2.指定微博用户id
3.填写cookie
4.运行
'''
# ---|| Initial parameters: fill these in before running ||---
dirpath = ''  # local directory the images are stored under
uid = ''  # page_id of the target user (empty placeholder so the file parses)
cookies = ''  # value sent as the Cookie request header
import os
import requests
import urllib
import re
from StringIO import StringIO
import pickle
import traceback
import time
def list_find(alist, ele):
    """Return the index of the first ``ele`` in ``alist``, or -1 if absent.

    A non-raising counterpart of ``list.index()``.
    """
    try:
        return alist.index(ele)
    except ValueError:
        # list.index() reports "not found" with ValueError; only that case is
        # mapped to -1 so that unrelated errors are not hidden.
        return -1
def get_response(url, headers='', params=''):
    """Fetch ``url`` with ``requests.get``, retrying when the request fails.

    Up to ``max_try_times`` attempts are made.  Each attempt passes
    ``wait_time`` as the requests timeout, and a failed attempt is followed
    by a pause of ``sleep_time`` seconds.  The HTTP status code is not
    inspected: any response that ``requests.get`` returns counts as success.

    Args:
        url: address to request.
        headers: forwarded to ``requests.get`` as ``headers``.
        params: forwarded to ``requests.get`` as ``params``.

    Returns:
        The response object of the first attempt that did not raise.

    Raises:
        SystemExit: if the last attempt fails too; the traceback of that
            failure is printed first.
    """
    import sys

    max_try_times = 20  # maximum number of attempts
    wait_time = 0.75    # timeout handed to requests for one attempt
    sleep_time = 0.25   # pause after a failed attempt
    for times in range(1, max_try_times + 1):
        try:
            return requests.get(url, timeout=wait_time, headers=headers, params=params)
        except requests.RequestException:
            # Only request failures are retried, so Ctrl-C (KeyboardInterrupt)
            # stops the program instead of starting another attempt.
            if times < max_try_times:
                time.sleep(sleep_time)
                continue
            print('[%s][ERROR] The last try failed at last , exit pro ...' % time.asctime()[11:19])
            traceback.print_exc()
            # Non-zero status so callers of the script can detect the failure.
            sys.exit(1)
def retrieve(imgurl, imgpath):
    """Download ``imgurl`` to the local file ``imgpath``, retrying on failure.

    Up to ``max_try_times`` attempts are made with a pause of ``sleep_time``
    seconds after each failed one.  When every attempt fails the image is
    skipped: a message is printed and no exception is raised.

    Args:
        imgurl: address of the image.
        imgpath: destination file path.

    Returns:
        True if an attempt completed without raising, False if all failed.
    """
    import socket

    max_try_times = 5  # maximum number of attempts
    wait_time = 15     # socket timeout for one attempt
    sleep_time = 3     # pause after a failed attempt
    # urlretrieve is called without a timeout argument, so the timeout is
    # applied through the process-wide socket default.
    socket.setdefaulttimeout(wait_time)
    for times in range(1, max_try_times + 1):
        try:
            urllib.urlretrieve(imgurl, imgpath)
            return True
        except Exception:
            # Deliberately broad (best-effort download), but unlike a bare
            # ``except`` it lets KeyboardInterrupt / SystemExit through.
            if times < max_try_times:
                time.sleep(sleep_time)
                continue
            print('[%s][ERROR] The last try failed at last , pass ...' % time.asctime()[11:19])
    return False
def secp(string, pattern1, pattern2=''):
    """Replace every occurrence of ``pattern1`` in ``string`` with ``pattern2``.

    The replacement is done in a single left-to-right pass, so inserted text
    is never scanned again; the call therefore terminates even when
    ``pattern2`` contains ``pattern1``.

    Args:
        string: text to process.
        pattern1: substring to look for; an empty value leaves ``string``
            unchanged.
        pattern2: replacement text (default: remove the occurrences).

    Returns:
        The processed string.
    """
    if not pattern1:
        # str.replace('', x) would insert x between every character.
        return string
    return string.replace(pattern1, pattern2)
def url_deal(url):
    """Normalise an image URL scraped from the page.

    All backslashes are removed and every ``thumb300`` is replaced with
    ``large`` (presumably switching from the thumbnail to the full-size
    variant of the picture).

    Args:
        url: URL text as captured from the page.

    Returns:
        The normalised URL.
    """
    return url.replace('\\', '').replace('thumb300', 'large')
def get_imgurl(html):
    """Collect the image URLs found in ``html``.

    Looks for ``photo_pict`` image tags written with backslash-escaped
    quotes whose ``src`` is an ``http:`` URL containing ``thumb300`` and
    ending in ``.jpg``, ``.gif`` or ``.png``.  Every match is passed through
    ``url_deal`` before being returned.

    Args:
        html: page text to scan.

    Returns:
        A list of processed URLs.  Matches are gathered one extension at a
        time, so the list is grouped by extension (jpg, gif, png) rather
        than following the order in the page.
    """
    imgurl_list = []
    for ext in ('jpg', 'gif', 'png'):
        # The dot before the extension is escaped so that only a literal '.'
        # is accepted there.
        pattern = r'class=\\\"photo_pict\\\" src=\\\"(http:\S+thumb300\S+\.' + ext + ')'
        imgurl_list.extend(url_deal(url) for url in re.findall(pattern, html, re.S))
    return imgurl_list
def save_img(imgurl, savepath, imgname):
    """Download ``imgurl`` into ``savepath`` under the name ``imgname``.

    The file extension is taken from the URL and appended to ``imgname``.

    Args:
        imgurl: address of the image.
        savepath: directory to store the file in.
        imgname: file name without extension.

    Returns:
        Whatever ``retrieve`` returns for the download.
    """
    # splitext keeps the leading dot ('.jpg') and copes with extensions that
    # are not exactly three characters long.
    imgext = os.path.splitext(imgurl)[1]
    return retrieve(imgurl, os.path.join(savepath, imgname + imgext))
def save_log(dic, path):
    """Pickle ``dic`` into the file at ``path``.

    Args:
        dic: object to store.
        path: destination file path; an existing file is overwritten.

    Returns:
        ``path`` on success, ``None`` if opening or writing the file failed
        (the traceback is printed in that case).
    """
    try:
        # ``with`` closes the file on every path and, unlike a ``finally``
        # that calls close(), is safe when open() itself fails.
        with open(path, 'wb') as out_file:
            pickle.dump(dic, out_file)
        return path
    except Exception:
        traceback.print_exc()
        return None
def load_log(path):
    """Load the pickled object stored in the file at ``path``.

    Args:
        path: file to read.

    Returns:
        The unpickled object, or ``None`` if the file could not be opened or
        unpickled (the traceback is printed in that case).
    """
    # NOTE(review): unpickling can execute arbitrary code; only point this at
    # log files that were written by this program.
    try:
        with open(path, 'rb') as in_file:
            return pickle.load(in_file)
    except Exception:
        traceback.print_exc()
        return None
def main():
    """Download all album photos of the configured user into ``dirpath``.

    Steps:
      1. Fetch the photo page of ``uid`` using ``cookies`` and check that it
         looks like a valid album page.
      2. Follow the ``since_id`` links to fetch the dynamically loaded pages
         and collect every image URL.
      3. Create ``<dirpath>/<user name>`` and load ``log.pkl`` from it if a
         previous run left one.
      4. Download every URL not yet in the log, naming files with a
         zero-padded running index, and rewrite the log after each image so
         that an interrupted run can be resumed.

    Raises:
        SystemExit: if the page is not recognised as a valid album page or
            the user name cannot be extracted from it.
    """
    import sys

    global total_num

    url = 'http://www.weibo.com/p/' + str(uid) + '/photos'
    headers = {
        'Cookie': cookies
    }
    # Fetch the album page.
    response = get_response(url, headers=headers)
    print('[%s][INFO] Pro starting at %s ...' % (time.asctime()[11:19], response.url))
    html = response.text

    # The page is accepted when at least one marker is present.  The last
    # marker is the text "的微博", written with escapes so the check does not
    # depend on the source file encoding.
    markers = ('thumb300', 'oid', u'\u7684\u5fae\u535a')
    if not any(marker in html for marker in markers):
        print('[%s][ERROR] Invalid cookies or page_id, please check !' % (time.asctime()[11:19]))
        sys.exit(1)

    # The first content="...," value is used as the user name (presumably a
    # meta tag of the page) and becomes the sub-directory name.
    names = re.findall(u'content="(.+?),', html, re.S)
    if not names:
        print('[%s][ERROR] Cannot find the user name in the page, please check !' % (time.asctime()[11:19]))
        sys.exit(1)
    uname = names[0]
    imgurl_list = get_imgurl(html)

    # Keep loading further pages for as long as the latest page carries a
    # since_id link.
    seen_since_ids = set()
    while True:
        result = re.findall(r'since_id=(\S+)">', html, re.S)
        if not result:
            break
        # The captured text ends with one extra character (presumably the
        # backslash escaping the closing quote), which is dropped.
        since_id = result[0][:-1]
        if since_id in seen_since_ids:
            # The same since_id again would request the same page forever.
            break
        seen_since_ids.add(since_id)
        payload = {
            'since_id': since_id,
            'page_id': uid,
            'ajax_call': 1
        }
        url = 'http://weibo.com/p/aj/album/loading'
        response = get_response(url, params=payload, headers=headers)
        html = response.text
        print('[%s][INFO] Got new page of %s !' % (time.asctime()[11:19], response.url))
        imgurl_list = imgurl_list + get_imgurl(html)

    # os.path.join keeps an empty dirpath relative to the working directory
    # instead of turning it into a path under the filesystem root.
    savepath = os.path.join(dirpath, uname)
    if not os.path.isdir(savepath):
        # makedirs also creates dirpath itself when it does not exist yet.
        os.makedirs(savepath)

    # Pages list images in the opposite order to the one used for numbering.
    imgurl_list.reverse()
    total_num = len(imgurl_list)

    # Load the download history of a previous run, if there is one.
    logpath = os.path.join(savepath, 'log.pkl')
    log_imgurl_list = []
    index = 1
    if os.path.isfile(logpath):
        print('[%s][INFO] Found log.pkl, loading...' % (time.asctime()[11:19]))
        logdic = load_log(logpath)
        if isinstance(logdic, dict):
            log_imgurl_list = logdic.get('imgurl_list') or []
            index = (logdic.get('last_num') or 0) + 1
        else:
            # An unreadable log must not crash the run; start over instead.
            print('[%s][WARN] log.pkl is unreadable, starting a new history ...' % (time.asctime()[11:19]))
    else:
        print('[%s][INFO] Not found log.pkl, creating a new one ...' % (time.asctime()[11:19]))

    # Set for O(1) "already downloaded?" checks; the list keeps the order
    # that is written to the log.
    downloaded = set(log_imgurl_list)
    for num, imgurl in enumerate(imgurl_list, 1):
        if imgurl in downloaded:
            print('[%s][INFO] Jump this image (%d/%d) !' % (time.asctime()[11:19], num, total_num))
            continue
        imgname = '{:0>5}'.format(index)
        if save_img(imgurl, savepath, imgname) is False:
            # An explicit False is taken as a failed download: the image is
            # left out of the log so that a later run tries it again.
            print('[%s][WARN] Failed to download image, not logged (%d/%d) !' % (time.asctime()[11:19], num, total_num))
            continue
        index = index + 1
        log_imgurl_list.append(imgurl)
        downloaded.add(imgurl)
        logdic = {
            'last_num': index - 1,
            'imgurl_list': log_imgurl_list
        }
        print('[%s][INFO] Writing log ... (%d/%d) !' % (time.asctime()[11:19], num, total_num))
        save_log(logdic, logpath)
        print('[%s][INFO] Successfully saved image as %s (%d/%d) !' % (time.asctime()[11:19], imgname, num, total_num))
# Run the downloader only when the file is executed as a script.
if __name__ == '__main__':
    main()
比如我的初始参数是:
dirpath = 'images' #与脚本同目录的images文件夹
uid = 1035051191258123 # 韩寒
cookies = 'SINAGLOBAL=221.237.83.131_146556……' #很长,不给你看
套路是这么个套路,大家有什么想法可以提一提嘛……