#!/usr/bin/env python
# encoding: utf-8
"""
Python: 3.6 转战python3,感觉良好,再也不用担心编码问题了oy
Author: ISeeMoon
Software: PyCharm
File: renren.py
Time: 2018/1/20 13:07
"""
import ast
import json
import os
import re

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
#手动输入用户名和密码来登陆并获取cookies
login_account = input("请输入你的用户名(以Enter键结束):")
password = input("请输入你的密码(以Enter键结束):")
#chrome版本为63.0,chromedriver.exe版本为v2.34
#1.chrome版本与chromedriver版本对应关系表见网址:
# http://blog.csdn.net/huilan_same/article/details/51896672
#2.chromedriver下载地址见:
# http://chromedriver.storage.googleapis.com/index.html
#3.将下载的chromedriver放到C盘下即可
chromepath = r"C:\chromedriver.exe"
cookie_dic = {}
chrome_opt = webdriver.ChromeOptions()
prefs={"profile.managed_default_content_settings.images":2}
chrome_opt.add_experimental_option("prefs",prefs)
browser = webdriver.Chrome(chromepath,chrome_options=chrome_opt)
browser.get('http://www.renren.com/')
wait = WebDriverWait(browser,3)
login = wait.until(
EC.presence_of_element_located((By.XPATH,"//input[@name='email']"))
)
login.send_keys(login_account)
pwd = wait.until(
EC.presence_of_element_located((By.XPATH,"//input[@id='password']"))
)
pwd.send_keys(password)
browser.find_element_by_xpath("//form[@id='loginForm']/dl[@class='savepassword clearfix']/dt/label[@class='labelCheckbox']/input[@id='autoLogin']").click()
browser.find_element_by_xpath("//form[@id='loginForm']/dl[@class='bottom']/input[@id='login']").click()
while 'ln_uact' not in cookie_dic.keys():
cookies = browser.get_cookies()
print('登陆Cookies获取完毕...')
# 将selenium获取的cookies格式转换为requests所识别的格式
for i in cookies:
cookie_dic[i['name']] = i['value']
print('登陆Cookies获取完毕,准备开始抓取相片...')
headers = {'Host':'photo.renren.com',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
headers1 = {'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36'}
user_id = cookie_dic['id']
albumlist_url = "http://photo.renren.com/photo/{}/albumlist/v7#".format(user_id)
albumInfo = []
def get_albumInfo():
#获取每个相册的名称和url
req = requests.get(albumlist_url,headers=headers,cookies=cookie_dic)
res = req.text
pattern = ''''albumList':\s(.*?]),'''
albumlist = eval(re.findall(pattern,res)[0])
for album in albumlist:
dic = {}
dic['albumName'] = album['albumName']
albumID = album['albumId']
dic['albumURL'] = "http://photo.renren.com/photo/{}/album-{}".format(user_id, albumID)
albumInfo.append(dic)
print('相册信息获取完毕...')
def get_photoURL(albumInfo):
for album in albumInfo:
print('开始获取相册:{} 中的照片信息...'.format(album['albumName']))
albumURL = album['albumURL']
# albumURL = 'http://photo.renren.com/photo/238355337/album-464515082/v7'
albumName = album['albumName']
res_photo = requests.get(albumURL,headers=headers,cookies=cookie_dic).text
pattern = "'photoList':(.*?]),"
photojson = json.loads(re.findall(pattern,res_photo)[0])
photoList = []
for photo in photojson:
photoURL = photo['url']
photoList.append(photoURL)
album['photoList'] = photoList
print('相册:{} 照片信息获取完毕...'.format(album['albumName']))
def download_photo(albumInfo):
print('在C盘创建人人网文件夹...')
dir_path = r"C:\人人网"
if os.path.exists(dir_path) == False:
os.mkdir(dir_path)
for album in albumInfo:
albumpath = os.path.join(dir_path,album['albumName'])
os.mkdir(albumpath)
print('创建-{}-相册'.format(album['albumName']))
for album in albumInfo:
#筛除空相册
if len(album['photoList']) != 0:
for i in range(len(album['photoList'])):
src = (album['photoList'][i])
photopath = '{}\{}\{}.jpg'.format(dir_path,album['albumName'],i)
with open(photopath,'wb') as f:
print(photopath, src)
try:
f.write(requests.get(src,headers=headers1,timeout=15).content)
except:
#记录下载失败的相片信息,并记录在人人网文件夹下的下载失败.txt文件里
print(album['albumName']+'→相册下的相片:'+src+' 下载失败!')
error_lst.append(src)
with open('c:\\人人网\下载失败.txt', 'w') as f:
for i in error_lst:
f.write(str(error_lst.index(i)) + ' ' + i + '\n')
def main():
get_albumInfo()
get_photoURL(albumInfo)
download_photo(albumInfo)
if __name__ == '__main__':
main()
# --- NOTE(review): the lines below are residue scraped from the web page this
# --- script was copied from (article title, copyright notice and "recommended
# --- reading" snippets). They are not Python and would break the file if left
# --- uncommented; preserved here verbatim as comments.
# Python爬虫代码--人人网照片下载
# 最后编辑于 :
# ©著作权归作者所有,转载或内容合作请联系作者
# - 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
# - 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
# - 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
# 推荐阅读更多精彩内容
# - 我的博客:http://www.wangs0622.com 参考书籍:《用 Python 写网络爬虫》 下载地址:...
# - 起因:前两天在公众号上看到一篇文章内容就是爬取王者荣耀的皮肤,但是内容太大概了,如果跟着他做肯定做不出来,所以我打...
# - 一、一个基本爬虫框架主要包括五大模块:爬虫调度器,URL管理器,HTML下载器,HTML解析器,数据存储器。 UR...
# - 1.sqlalchemy与mysql的连接 1.创建数据库 2.准备连接接数据库的数据 3.DA_URI的参考格式...