- 使用的技术:selenium、xpath、css_selector、re
- 欢迎讨论
# coding=utf-8
'''
使用selenium爬取猫眼电影
名称评分上映时间
'''
import json
import random
import re
import time

from selenium.webdriver import Chrome, ActionChains
from selenium.webdriver.common.by import By
# selenium防反爬
from selenium.webdriver import ChromeOptions
# Chrome options: exclude the "enable-automation" switch and turn off the
# automation extension.
options = ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
options.add_experimental_option('useAutomationExtension', False)

driver = Chrome(options=options)

# JavaScript registered through the DevTools protocol to be evaluated on every
# new document; it redefines navigator.webdriver so that it reads as undefined.
_hide_webdriver_js = """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
driver.execute_cdp_cmd(
    "Page.addScriptToEvaluateOnNewDocument",
    {"source": _hide_webdriver_js},
)

# Element lookups wait up to 30 seconds before failing.
driver.implicitly_wait(30)
# 功能函数
def filterScore(origin):
    '''
    Turn raw score fragments into plain score strings.

    Each entry of ``origin`` is the text captured from inside a score cell:
    either markup such as ``<i class="integer">9.</i><i class="fraction">5</i>``
    or a plain placeholder text when the film has no score.

    Tags are removed with a regular expression instead of slicing at fixed
    character offsets, so the result no longer depends on the exact tag
    attributes or on the integer part being a single digit (``10.`` + ``0``
    yields ``10.0`` rather than ``100``). Entries without markup are returned
    unchanged.

    :param origin: iterable of raw score strings
    :return: list of cleaned score strings, same order and length as the input
    '''
    return [re.sub(r'<[^>]*>', '', raw) for raw in origin]
# Log in first so that a login page does not interrupt the crawl later on.
# Elements are located with find_element(By.XPATH, ...): the older
# find_element_by_xpath helpers no longer exist in current Selenium releases.
url_ = 'https://maoyan.com/'
driver.get(url_)
# Hover over the header element (presumably the user menu) so that the login
# link inside it can be clicked.
ActionChains(driver).move_to_element(
    driver.find_element(By.XPATH, '/html/body/div[1]/div/div[3]/div')
).perform()
# Click "log in now".
driver.find_element(By.XPATH, '/html/body/div[1]/div/div[3]/div/ul/li/a').click()
# The two literals below are placeholders for the account name and password.
driver.find_element(By.XPATH, '//*[@id="login-email"]').send_keys('账号')
driver.find_element(By.XPATH, '//*[@id="login-password"]').send_keys('密码')
# Click the user-agreement control, then the form's submit input.
driver.find_element(By.XPATH, '//*[@id="user-agreement-wrap-text-circle"]/i').click()
driver.find_element(By.XPATH, '//*[@id="J-normal-form"]/div[5]/input[5]').click()
time.sleep(5)
# NOTE: this message is printed unconditionally; the login result itself is
# not checked.
print('登录成功')
# Start the actual crawl.
url_ = 'https://maoyan.com/films?showType=3&offset=0'  # Maoyan film list start URL
driver.get(url_)
driver.implicitly_wait(30)
time.sleep(10)  # pause so that a captcha can be solved by hand

# Scrape five consecutive result pages.
for _page in range(5):
    text = driver.page_source
    print(text)
    # XPath could not extract these fields, so regular expressions are used.
    titleLIst = re.findall(
        r'data-act="movies-click" data-val="{movieId:.*?}">(.*?)</a>', text)
    originScoreList = re.findall(
        r'<div class="channel-detail channel-detail-orange">(.*?)</div>', text)
    scoreList = filterScore(originScoreList)
    # Newlines are removed first because "." does not match them.
    upTimeList = [
        raw.strip()
        for raw in re.findall(
            r'<span class="hover-tag">上映时间:</span>(.*?)</div>',
            text.replace('\n', ''))
    ]
    print(titleLIst, len(titleLIst))
    print(scoreList, len(scoreList))
    print(upTimeList, len(upTimeList))

    # Simulate scrolling: ten steps of 100px each, starting from the top.
    distance_ = 0
    for _step in range(10):
        driver.execute_script(f'window.scrollTo(0,{distance_})')
        distance_ += 100
        time.sleep(0.2)

    # Persist the page. The three lists are matched independently, so their
    # lengths can differ; pairing by index would then fail with IndexError
    # half-way through the page. Warn and keep the common prefix instead.
    if not (len(titleLIst) == len(scoreList) == len(upTimeList)):
        print('warning: field counts differ '
              f'(title={len(titleLIst)}, score={len(scoreList)}, '
              f'upTime={len(upTimeList)}); records may be misaligned')
    records = [
        {'title': title, 'score': score, 'upTime': up_time}
        for title, score, up_time in zip(titleLIst, scoreList, upTimeList)
    ]
    if records:
        # Open the output once per page instead of once per record. Each
        # record is followed by ",\n", so the file as a whole is a list of
        # JSON objects rather than a single valid JSON document.
        with open('1.json', 'a', encoding='utf-8') as f:
            f.writelines(
                json.dumps(record, ensure_ascii=False) + ',\n'
                for record in records)

    # Short randomized pause before moving on.
    time.sleep(random.choice([1,2,1.5]))
    # Click "next page". find_element(By.CSS_SELECTOR, ...) replaces the
    # find_element_by_css_selector helper, which current Selenium lacks.
    driver.find_element(
        By.CSS_SELECTOR,
        '#app > div > div.movies-panel > div.movies-pager > ul > li:last-child > a'
    ).click()