最近想练练爬虫,就写了这么一个东西。说起来,Python 干这种事真是得心应手。因为我也是看了别人的文章学来的,所以就不讲解啦,只贴代码。
#!/usr/bin/env python
# coding:utf-8
import requests
import os
import json
from lxml import etree
class ZhihuClient(object):
    """Small Zhihu crawler: log in, cache cookies, download answer images.

    The client keeps one ``requests`` session.  Cookies obtained by a
    successful login are stored as JSON in ``COOKIE_FILE`` and reused on the
    next run so the captcha/login round trip can be skipped.

    Attributes:
        captchaFilepath: where the captcha image is temporarily written so
            the user can open it and type the code.
        isLogin: True once a cookie has been loaded or a login succeeded.
    """

    # HTTPS so the password and the session cookies are not sent in clear.
    BASE_URL = 'https://www.zhihu.com'
    # Files written to / read from the current working directory.
    COOKIE_FILE = 'zhihucookie'
    HOME_FILE = 'home.html'
    # Historical default location of the captcha image; override it through
    # the constructor on machines where this path does not exist.
    DEFAULT_CAPTCHA_PATH = '/Users/lz/Desktop/cap.gif'
    # Directory name used when a page title cannot be turned into a path.
    FALLBACK_DIR_NAME = 'zhihu_page'

    def __init__(self, captchaFilepath=DEFAULT_CAPTCHA_PATH):
        """Create the HTTP session with browser-like headers.

        Args:
            captchaFilepath: path of the temporary captcha image file.
        """
        super(ZhihuClient, self).__init__()
        self.__session = requests.Session()
        self.__session.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
            "Accept-Encoding": "gzip, deflate",
            "Host": "www.zhihu.com",
            "Upgrade-Insecure-Requests": "1",
        }
        self.captchaFilepath = captchaFilepath
        self.isLogin = False

    def loadHome(self):
        """Fetch the Zhihu home page and save it to ``HOME_FILE``.

        Uses the cached cookie when one is available; otherwise prompts for
        credentials and logs in first.
        """
        self.__cookie = self.__loadCookie()
        if self.__cookie:
            print('拥有cookie,直接登录')
            self.__session.cookies.update(self.__cookie)
        else:
            print('缺少cookie,请登录')
            username = input('请输入你的用户名:')
            password = input('请输入你的密码')
            self.login(username, password)
        home = self.__session.get(self.BASE_URL).text
        # Explicit encoding: the platform default may be unable to encode
        # the Chinese text of the page.
        with open(self.HOME_FILE, 'w', encoding='utf-8') as f:
            f.write(home)

    def login(self, username, password):
        """Log in with a phone number or e-mail address and a password.

        Reads the ``_xsrf`` token from the home page, asks the user to solve
        a captcha, then posts the login form.  On success the session
        cookies are saved and ``isLogin`` becomes True; on failure the
        server message is printed.

        Raises:
            IndexError: the home page contains no ``_xsrf`` input field.
        """
        self.setusername(username)
        self.__password = password
        # Parse the page as served: lowercasing the markup would also
        # lowercase the token value.
        page = etree.HTML(self.__session.get(self.BASE_URL).text)
        fields = page.xpath('//input[@name="_xsrf"]')
        if not fields:
            raise IndexError('_xsrf token not found on the Zhihu home page')
        self.__xsrf = fields[0].attrib['value']
        self.getgif()
        data = {
            '_xsrf': self.__xsrf,
            'password': self.__password,
            'remember_me': 'true',
            self.__usernametype: self.__username,
            'captcha': self.__captcha,
        }
        res = self.__session.post(
            self.BASE_URL + '/login/' + self.__usernametype, data=data)
        print('=' * 50)
        reply = res.json()
        if reply['r'] == 0:
            print('登陆成功')
            self.isLogin = True
            self.__saveCookie()
        else:
            print('登陆失败')
            print('错误信息 ----- >', reply['msg'])

    def getgif(self):
        """Download the captcha image and ask the user to type its code.

        The image is written to ``captchaFilepath`` and removed again once
        the prompt returns, even if the prompt is interrupted.
        """
        captcha = self.__session.get(self.BASE_URL + '/captcha.gif').content
        with open(self.captchaFilepath, 'wb') as output:
            output.write(captcha)
        print('=' * 50)
        try:
            self.__captcha = input('请输入验证码:')
        finally:
            os.remove(self.captchaFilepath)

    def setusername(self, username):
        """Store the user name and pick the matching login form field.

        An all-digit name is treated as a phone number (``phone_num``),
        anything else as an e-mail address (``email``).
        """
        self.__username = username
        self.__usernametype = 'phone_num' if username.isdigit() else 'email'

    @staticmethod
    def _safeDirName(title):
        """Turn a page title into a name usable as a single directory."""
        cleaned = (title or '').strip()
        for separator in ('/', '\\'):
            cleaned = cleaned.replace(separator, '_')
        if cleaned in ('', '.', '..'):
            return ZhihuClient.FALLBACK_DIR_NAME
        return cleaned

    def loadUrl(self, url):
        """Save the page at ``url`` and download the images of its answers.

        A directory named after the page title is created (reused when it
        already exists); the raw HTML is stored inside it as
        ``<title>.html`` and the images are saved next to it.
        """
        htmlS = self.__session.get(url).text
        # Parse the page as served: image URLs and the title are
        # case-sensitive, so the markup must not be lowercased.
        html = etree.HTML(htmlS)
        result = html.xpath('//span[@class="zm-editable-content"]')
        title = ''.join(result[0].itertext()) if result else ''
        name = self._safeDirName(title)
        os.makedirs(name, exist_ok=True)
        with open(os.path.join(name, name + '.html'), 'w',
                  encoding='utf-8') as f:
            f.write(htmlS)
        print('目标网页获取成功')
        nodes = html.xpath(
            '//img[@class="origin_image zh-lightbox-thumb"]')
        # Skip <img> elements that carry no src attribute.
        imgs = [src for src in (node.get('src') for node in nodes) if src]
        # downloadImg concatenates path and file name, so the directory
        # must end with a separator.
        self.downloadImg(imgs, os.path.join(name, ''))

    def downloadImg(self, imgs, path):
        """Download every URL in ``imgs`` to ``path`` as ``<index>.png``.

        ``path`` is used as a plain prefix, so it must end with a path
        separator when it names a directory.
        """
        for index, url in enumerate(imgs):
            print(url)
            # NOTE(review): this bypasses the session, presumably because
            # the session pins "Host: www.zhihu.com", which would not match
            # an image server -- confirm before switching to the session.
            img = requests.get(url).content
            with open(path + '%d.png' % index, 'wb') as f:
                f.write(img)

    def __saveCookie(self):
        """Write the session cookies to ``COOKIE_FILE`` as JSON."""
        with open(self.COOKIE_FILE, 'w') as f:
            json.dump(self.__session.cookies.get_dict(), f)
        print('=' * 50)
        print('已经生成cookiew文件')

    def __loadCookie(self):
        """Return the cookies cached in ``COOKIE_FILE``, or None.

        None is returned when the file is missing, is not valid JSON, or
        does not hold a non-empty JSON object.  ``isLogin`` is set to True
        only when usable cookies were loaded.
        """
        if not os.path.exists(self.COOKIE_FILE):
            return None
        print('=' * 50)
        try:
            with open(self.COOKIE_FILE, 'r') as f:
                cookie = json.load(f)
        except ValueError:
            # Corrupt cache (json.JSONDecodeError is a ValueError): fall
            # back to a fresh login.
            return None
        if not isinstance(cookie, dict) or not cookie:
            return None
        self.isLogin = True
        return cookie
if __name__ == '__main__':
    # Script entry point: make sure a session exists, then download the
    # images of the page the user points at.
    zhihu = ZhihuClient()
    already_logged_in = zhihu.isLogin
    if not already_logged_in:
        print('我要登录')
        # loadHome reuses a cached cookie or prompts for credentials.
        zhihu.loadHome()
    zhihu.loadUrl(input('请输入网址:'))