大体思路:
- 分析糗百url格式
https://www.qiushibaike.com/8hr/page/2/
,发现需要变动的就是 page 后面的数字,故应将该占位符的页码值进行动态赋值
- 封装三个函数,功能分别为:下载 get_html()、保存 save_html()、主程序运行 main()
- 涉及模块知识点:
urllib.request.Request类的使用
,urllib.request.urlopen方法的使用
代码如下:
# -*- coding: utf-8 -*-
# @author : JackYoung
# @date : 2018-03-29 16:24:03
# @email : chuzhu2010@163.com
# @link : http://www.jianshu.com/u/05b01810625
import os
from urllib.request import Request
from urllib.request import urlopen
import random
def get_html(url, filename, timeout=10):
    """Download the page at *url* and return its raw bytes.

    :param url: the URL to download
    :param filename: display name, used only for the progress message
    :param timeout: socket timeout in seconds (default 10, so a stalled
        connection cannot hang the crawler forever — the original had none)
    :return: the response body as ``bytes``
    :raises urllib.error.URLError: on network failure or timeout
    """
    print("正在下载" + filename)
    # Rotate through a few desktop User-Agent strings so consecutive
    # requests look less like a single bot.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.104 Safari/537.36 Core/1.53.4882.400 QQBrowser/9.7.13059.400",
    ]
    header = {
        "User-Agent": random.choice(user_agent_list)
    }
    # Build the Request object with the spoofed header.
    request = Request(url, headers=header)
    # Use the response as a context manager so the underlying socket is
    # always closed (the original leaked it).
    with urlopen(request, timeout=timeout) as response:
        return response.read()
def save_html(html, filename):
    """Write downloaded page bytes to ``<cwd>/糗事百科/<filename>``.

    :param html: page content as ``bytes``
    :param filename: name of the file to create inside the output directory
    :return: None
    """
    print("正在保存" + filename)
    base_dir = os.path.join(os.getcwd(), '糗事百科')
    # Bug fix: the original crashed with FileNotFoundError on the very
    # first run because the output directory was never created.
    os.makedirs(base_dir, exist_ok=True)
    file_path = os.path.join(base_dir, filename)
    with open(file_path, "wb") as f:
        f.write(html)
def main(num):
    """Crawl *num* pages of qiushibaike and save each one as an HTML file.

    :param num: total number of pages to fetch
    :return: None
    """
    # Page number is the only varying part of the URL; fill the
    # placeholder for each page.
    base_url = "http://www.qiushibaike.com/8hr/page/{}/"
    # Page numbers are 1-based, hence range(1, num + 1).
    for page in range(1, num + 1):
        url = base_url.format(page)
        print(url)
        filename = "糗事百科第" + str(page) + "页.html"
        # Fetch the page, then persist it under the matching file name.
        save_html(get_html(url, filename), filename)
    print("下载完成")
if __name__ == "__main__":
    # Script entry point: ask how many pages to crawl, then run.
    main(int(input("请输入爬取糗百的总页数:")))
运行结果如下: