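# A cookie carries the login state for an account, so you can try to log in to
# the target site with cookies and then scrape its pages. A cookie is usually
# only valid for about half an hour, whereas a URL may stay valid for several
# days. Logging in the way the previous lessons did gets blocked by the site,
# so http.cookiejar is needed here.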
#导入cookiejar
import http.cookiejar
import urllib.parse
import urllib.request
# Log in to renren.com with a cookie-aware opener, then fetch a profile page
# through the same opener and save it to rr20.html.

# Create the cookie jar first and wire it into an opener: the
# HTTPCookieProcessor stores cookies from responses in `cookie` and attaches
# them to later requests made through `opener`.
# Note: these class names are case-sensitive.
cookie = http.cookiejar.CookieJar()
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)

# NOTE(review): the URL literal ends with a space; urllib.request.Request
# strips surrounding whitespace from the URL, so it is harmless here.
post_url ="http://www.renren.com/ajaxLogin/login?1=1&uniqueTimestamp=20191131847172 "
headers = {
    "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
}

# The login is a POST request, so the form fields travel in the request body.
# NOTE(review): credentials are hard-coded in the source; consider loading
# them from the environment or a config file that is not committed.
# NOTE(review): the "f" value is already percent-encoded and urlencode() below
# will encode the "%" signs again -- confirm this is what the server expects.
data = {
    "email":"zj1028hlju@sohu.com",
    "icode":"",
    "origURL":"http://www.renren.com/home",
    "domain":"renren.com",
    "key_id":"1",
    "captcha_type":"web_login",
    "password":"3ef61cfeffe709bcbc0afcce4e752af3a4190d57c1f4918f81c814a694c676e4",
    "rkey":"aed6ec6603b6e6c14f782872bcdf6cc3",
    "f":"http%3A%2F%2Fwww.renren.com%2F973165373%2Fnewsfeed%2Fphoto",
}
# Request bodies must be bytes: build the form string, then encode it.
data = urllib.parse.urlencode(data).encode("utf-8")
p_req = urllib.request.Request(data=data,headers=headers,url=post_url)

# Send the login request through the opener so the cookies set by the server
# end up in `cookie`. The body is not used, so the response is closed right
# away instead of being left open.
login_req = opener.open(p_req)
login_req.close()

# The opener now holds the login cookies, so the profile page can be requested
# directly: no form data is needed, only the same headers.
owner_url ="http://www.renren.com/317141561/profile"
owner_req = urllib.request.Request(url=owner_url,headers=headers)
# Use the response as a context manager so the connection is released even if
# reading or decoding fails.
with opener.open(owner_req) as res:
    content = res.read().decode("utf-8")

# Save the fetched page for inspection.
with open("rr20.html","w",encoding="utf-8") as fp:
    fp.write(content)