本章教学爬取腾讯动漫,实现腾讯动漫爬取
爬取腾讯动漫和普通图片大不一样,腾讯的反爬虫机制比较多,主要克服问题:
1.Javascript动态加载
2.图片异步加载
实现代码的基本思路
1.找出所要下载腾讯动漫章节的url
2.生成需要爬取的N个章节的url
3.打开每一章节url,获取里面的数据,找出里面所有的动漫图片地址(这里解决Javascript动态加载、图片异步加载问题)
4.下载所有动漫图片
下面是具体的代码实现
from selenium import webdriver #解决图片异步加载的包
import time
import re
import os
import urllib.request
#找到动漫名称,根据动漫名创建存储的文件夹
def find_title(url, driver=None):
    """Render the comic page, extract the comic title, and create a folder
    named after it.

    The page content is JavaScript-generated, so it is rendered with a
    browser driver before the <title> tag is read.

    Args:
        url: URL of one chapter page of the comic.
        driver: optional selenium WebDriver to reuse. When None (the
            default, matching the original behavior) a PhantomJS driver is
            created locally — and, unlike the original, quit before
            returning so the browser process is not leaked.

    Returns:
        The comic title string, which is also the created folder's name.

    Raises:
        ValueError: if no 《title》 pattern is found in the page.
    """
    own_driver = driver is None
    if own_driver:
        # Rendering with a real browser resolves the JavaScript-loaded DOM.
        driver = webdriver.PhantomJS()
    try:
        driver.get(url)
        data = driver.page_source  # rendered page HTML
        match = re.search(r'<title>《(.*?)》', data)
        if match is None:
            raise ValueError('could not find comic title in page: ' + url)
        title = match.group(1)
    finally:
        if own_driver:
            # Only quit a driver we started ourselves; a caller-supplied
            # driver stays alive for further use.
            driver.quit()
    if os.path.exists(title):
        print('文件已存在')
    else:
        os.mkdir(title)
        print('创建文件夹成功')
    time.sleep(5)  # original pacing: give the site a breather before continuing
    return title
#1.1找到目的章节url最后一个数字,为的是获取接下来所有章节的url
def find_firstnmber(url):
    """Return the trailing chapter number of *url* as an int.

    Example: '.../ComicView/index/id/505430/cid/7' -> 7.

    Args:
        url: chapter URL ending in '/<digits>'.

    Returns:
        The trailing number as an int.

    Raises:
        ValueError: if the URL does not end with '/<digits>' (the original
            raised an opaque IndexError here).
    """
    match = re.search(r'/([0-9]+)$', url)
    if match is None:
        raise ValueError('URL does not end with a chapter number: ' + url)
    return int(match.group(1))
#1.2获取接下来所有章节的url
def find_front_url(url, firstnmber):
    """Strip the trailing chapter number off *url* and return the prefix.

    Args:
        url: chapter URL that ends with the chapter number.
        firstnmber: that chapter number, as str (how main passes it) or
            int; only its digit count matters.

    Returns:
        *url* with its last len(str(firstnmber)) characters removed, e.g.
        ('http://x/cid/12', '12') -> 'http://x/cid/'.
    """
    # The original rebuilt the string through a comma-joined character list
    # and then removed all commas — which silently deleted any ',' already
    # present in the URL. A simple slice is both correct and O(1) to read.
    return url[:-len(str(firstnmber))]
#第2点在主函数内完成
#3.1打开每一章节url,获取里面的数据
def Download_chapter_url(url, title, chapter, driver, scrolls=15, pause=0.5):
    """Render one chapter page and save its HTML to ./<title>/第<chapter>章.html.

    Scrolls the in-page viewer step by step so the asynchronously loaded
    images get real URLs before the HTML is captured. If the HTML file
    already exists the function returns immediately — the original printed
    a "skip" message but then overwrote the file with the *un-scrolled*
    page source anyway, losing the lazily loaded image URLs.

    Args:
        url: chapter URL.
        title: comic folder name (created by find_title).
        chapter: 1-based chapter index, used in the file name.
        driver: selenium WebDriver already started by the caller.
        scrolls: number of scroll steps, each 1000px further down
            (default 15, the original hard-coded value).
        pause: seconds to wait between scroll steps (default 0.5).
    """
    driver.get(url)
    html_route = "./" + title + "/第" + str(chapter) + "章.html"
    if os.path.exists(html_route):
        print('html文件已存在,跳过爬取网页步骤')
        return
    for i in range(1, scrolls + 1):
        time.sleep(pause)  # give the async image loading time to fire
        # Scrolling the viewer element triggers the JavaScript lazy loading.
        js = 'var q=document.getElementById("mainView").scrollTop=' + str(i * 1000)
        driver.execute_script(js)
    # Context manager guarantees the file handle is closed even on error.
    with open(html_route, "w", encoding="utf-8") as fh:
        fh.write(driver.page_source)
#3.2找出所章节里的图片地址
def find_chapter_pictures(title, chapter):
    """Parse a saved chapter HTML file and download every comic image in it.

    Reads ./<title>/第<chapter>章.html, extracts the chapter title and all
    image URLs, creates a per-chapter folder, and saves each image into it
    as 1.jpg, 2.jpg, ... via download_jpg. If the chapter folder already
    exists the whole download is skipped — the original printed a "skip"
    message but then downloaded everything anyway.

    Args:
        title: comic folder name.
        chapter: 1-based chapter index.

    Raises:
        IndexError: if the HTML contains no chapter title or no image URLs
            (same failure mode as the original).
    """
    filename = "./" + title + "/第" + str(chapter) + "章.html"
    # Let open() decode the text instead of read().decode(); the context
    # manager also guarantees the handle is closed.
    with open(filename, encoding='utf-8') as f:
        html = f.read()
    # Chapter title sits between the book title and the site suffix.
    chapter_title = re.findall(r'<title>《.*》(.*)-在线漫画', html)[0]
    imglist = re.findall(r'(https://manhua.qpic.cn/manhua.*?\.jpg/0)', html)
    # Drop the first match — assumed not to be a page image (TODO confirm
    # against a live chapter page).
    del imglist[0]
    print('爬取到' + str(len(imglist)) + '张图片,预计耗时' + str(1 * len(imglist)) + 's')
    chapter_route = './' + title + '/' + '第' + str(chapter) + '章' + chapter_title
    if os.path.exists(chapter_route):
        print('章节文件夹已存在,跳过图片下载')
        return
    os.mkdir(chapter_route)
    for page, img_url in enumerate(imglist, start=1):
        download_jpg(img_url, page, chapter_route)  # ~1s each (sleep inside)
    print('本章下载完成')
#4下载所有动漫图片
def download_jpg(jpg_url, page, chapter_route):
    """Download one image and save it as <chapter_route>/<page>.jpg.

    Sends a browser-like User-Agent header (presumably to avoid being
    rejected by the image host — TODO confirm) and sleeps 1s afterwards to
    throttle the request rate.

    Args:
        jpg_url: direct image URL.
        page: 1-based page number, used as the file name.
        chapter_route: existing directory to save into.
    """
    req = urllib.request.Request(jpg_url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36')
    # Context managers close both the HTTP response and the file handle —
    # the original closed neither the response nor, on error, the file.
    with urllib.request.urlopen(req) as response:
        jpg_data = response.read()
    with open(chapter_route + '/' + str(page) + '.jpg', 'wb') as jpg:
        jpg.write(jpg_data)
    print('图片' + str(page) + '下载完成')
    time.sleep(1)  # simple rate limiting between image requests
#主函数
if __name__ == '__main__':
    # Entry point: ask for a chapter URL and a chapter count, then crawl
    # and download that many consecutive chapters.
    url = input("请输入你想爬取的腾讯动漫网址:")
    sum_page = int(input("请输入你想下载的章节数:"))
    # Start the headless browser only after both inputs are collected.
    driver = webdriver.PhantomJS()
    try:
        # Load the page once to learn the comic title and create its folder.
        title = find_title(url)  # ~5s (includes an internal sleep)
        firstnmber = find_firstnmber(url)
        front_url = find_front_url(url, str(firstnmber))
        # 2. Generate the url of each of the N requested chapters: the
        # trailing number simply increments from the starting chapter.
        for i in range(sum_page):
            now_url = front_url + str(firstnmber + i)
            chapter = i + 1
            print("开始模拟浏览器,爬取本章页面代码,预计耗时8s")
            Download_chapter_url(now_url, title, chapter, driver)  # writes ./title/第n章.html
            print("开始下载本章图片")
            find_chapter_pictures(title, chapter)
    finally:
        # Always release the browser process — the original only quit the
        # driver when every chapter succeeded.
        driver.quit()