总目录:https://www.jianshu.com/p/e406a9bc93a9
Python-爬虫 - 子目录:https://www.jianshu.com/p/23cf57674bf1
官方手册:http://2.python-requests.org/zh_CN/latest/user/quickstart.html
requests是python实现的最简单易用的HTTP库,建议爬虫使用requests
requests是基于urllib3封装的第三方库(需要单独安装:pip install requests)
概述
实例
import requests
#发起GET请求
url ="https://www.python.org/"
response=requests.get(url=url)
# 查看响应类型requests.models.Response
print(response)
<Response [200]>
print(type(response))
<class 'requests.models.Response'>
# 输出状态码
print(response.status_code)
200
# 输出响应内容类型
print(type(response.content))
<class 'bytes'>
print(type(response.text))
<class 'str'>
# 输出响应内容
print(response.content) #获取源码的字节流格式,一般用于获得图片和视频
print(response.text) #获取源码的文本格式(如要对获取源码进行分析用这个)
# 输出编码格式
print(response.encoding)
UTF-8
# 输出cookies
print(response.cookies)
<RequestsCookieJar[]> #没有cookies
各种请求方式
# 发起POST请求
requests.post('http://httpbin.org/post')
# 发起PUT请求
requests.put('http://httpbin.org/put')
# 发起DELETE请求
requests.delete('http://httpbin.org/delete')
# 发送HEAD请求
requests.head('http://httpbin.org/get')
# 发送OPTIONS请求
requests.options('http://httpbin.org/get')
请求
get带参请求
下面说get带参请求的三种方式:
?+参数(?key1=value1)
import requests
response1 = requests.get("http://httpbin.org/get?key1=value1")
print(response1.url)
http://httpbin.org/get?key1=value1
params参数
import requests
parameter = {
"key1":"value1",
"key2":"value2"
}
response2 = requests.get("http://httpbin.org/get",params = parameter)
print(response2.url)
http://httpbin.org/get?key1=value1&key2=value2
还可以将一个列表作为值传入
import requests
parameter = {
"key1":"value1",
"key2":["value21","value22"]
}
response3 = requests.get("http://httpbin.org/get",params = parameter)
print(response3.url)
http://httpbin.org/get?key1=value1&key2=value21&key2=value22
注意字典里值为 None 的键都不会被添加到 URL 的查询字符串里。
import requests
parameter = {
"key1":"value",
"key2":None
}
response4 = requests.get("http://httpbin.org/get",params = parameter)
print(response4.url)
http://httpbin.org/get?key1=value
post带参请求
传递一个字典类型数据
import requests
payload = {
"key1":"value1",
"key2":"value2"
}
response = requests.post("http://httpbin.org/post",data = payload)
print(response.text)
传递一个元组类型数据
import requests
payload = (("key1","value1"),("key1","value2"))
response = requests.post("http://httpbin.org/post",data = payload)
print(response.text)
解析json
import requests
response = requests.get('http://httpbin.org/get')
# 获取响应内容
print(response.text)
# 如果响应内容是json格式,就将其解析为Python对象(这里是字典)
print(response.json())
# 输出的是字典类型
print(type(response.json()))
定制请求头
import requests
#以知乎为例子
response =requests.get("https://www.zhihu.com")
print(response.text)#报错
new_headers={
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36 SE 2.X MetaSr 1.0'
}
response = requests.get("https://www.zhihu.com",headers = new_headers)
print(response.text)#正常输出
基本POST请求格式
import requests
# 设置传入post表单信息
data= {'name':'hanxuan', 'age':20}
# 设置请求头信息
headers = {
"User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36"
}
# 设置请求头信息和POST请求参数(data)
response = requests.post('http://httpbin.org/post', data=data, headers=headers)
print(response.text)
响应
获得响应属性
import requests
response = requests.get('http://www.jianshu.com/')
# 获取响应状态码
print(type(response.status_code),response.status_code)
# 获取响应头信息
print(type(response.headers),response.headers)
# 获取响应头中的cookies
print(type(response.cookies),response.cookies)
# 获取访问的url
print(type(response.url),response.url)
# 获取访问的历史记录
print(type(response.history),response.history)
高级操作
文件上传
import requests
files = {'file': open('s1.ico', 'rb')}
# 在POST请求体中以multipart/form-data方式上传文件(files)
response = requests.post('http://httpbin.org/post', files=files)
print(response.text)
获取cookies
import requests
response = requests.get('http://www.baidu.com')
print(response.cookies)
for key, value in response.cookies.items():
    print(key, ':', value)
<RequestsCookieJar[<Cookie BDORZ=27315 for .baidu.com/>]>
BDORZ : 27315
会话维持
import requests
session = requests.session()
# 使用session去请求,保证了多次请求处于同一个会话(共享cookies)
session.get('http://httpbin.org/cookies/set/number/12456')
response = session.get('http://httpbin.org/cookies')
print(response.text)
设置代理
普通的代理设置
import requests
proxies = {
"http":"http://10.10.1.10:3128",
"https":"http://10.10.1.10:1080",
}
requests.get("http://example.org", proxies=proxies)
当然也可以带用户名和密码:
proxies={"http":"http://user:pass@10.10.1.10:3128/",}
超时设置
import requests
from requests.exceptions import ReadTimeout
try:
    # 设置必须在500ms内收到响应,不然会抛出ReadTimeout异常
    response = requests.get("http://httpbin.org/get", timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    print('Timeout')
200
异常处理
import requests
from requests.exceptions import ReadTimeout, ConnectionError, RequestException
try:
    response = requests.get("http://httpbin.org/get", timeout=0.5)
    print(response.status_code)
except ReadTimeout:
    # 超时异常
    print('Timeout')
except ConnectionError:
    # 连接异常
    print('Connection error')
except RequestException:
    # 请求异常
    print('Error')