"""
filename: maoyantop100/spider.py
python: 3.7.0
description: crawl the Maoyan top-100 movie board using requests and regular expressions
author: master
ide: pycharm
"""
import requests
import re
import json
# Request headers sent with every page fetch.  The User-Agent is mandatory:
# without it the site answers HTTP 403 (see crawl_one_page).
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
# The headers below turned out to be unnecessary and are kept disabled:
# 'Host': 'maoyan.com',
# 'Upgrade-Insecure-Requests': '1',
# 'Referer': 'http://maoyan.com/board',
# 'Connection': 'keep-alive',
}
def crawl_one_page(url, timeout=10):
    """Fetch one board page and return its HTML text.

    A browser User-Agent is required; without it the site returns 403.

    :param url: page URL to fetch
    :param timeout: seconds before the request is aborted (new parameter,
        defaults to 10 so a stalled connection no longer hangs forever)
    :return: HTML text on HTTP 200; otherwise a short error string, which
        the caller's regex simply fails to match (yielding no items)
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code == 200:
        return response.text
    # Keep returning a str (not None) so parse_one_page can still be
    # applied to the result without raising a TypeError.
    return "status code :" + str(response.status_code)
def parse_one_page(content):
    """Extract movie records from one page of board HTML.

    The pattern mirrors the page source: uninteresting stretches are skipped
    with lazy ``.*?`` and each wanted field is a lazy capture group; ``re.S``
    lets ``.`` cross newlines.  Raw strings are used so ``\\s`` is a regex
    escape rather than an invalid string escape (the original non-raw form
    triggers a DeprecationWarning on Python 3.7+).

    :param content: HTML text of one board page
    :yield: dict per movie with keys index, image_url, title,
        protagonist (list of names), date, grade
    """
    pattern = re.compile(
        r'<dd>.*?board-index-.*?">(.*?)</i>.*?'
        r'data-src="(.*?)".*?class="name".*?'
        r'data-val.*?>(.*?)</a>.*?'
        r'star">\s*?主演:(.*?)\s*?</p>.*?'
        r'releasetime">上映时间:(.*?)</p>.*?'
        r'integer">(.*?)</i><i class="fraction">(.*?)</i></p>',
        re.S)
    for index, image, title, stars, date, integer, fraction in pattern.findall(content):
        yield {
            'index': index,
            'image_url': image,
            'title': title,
            'protagonist': stars.split(','),
            'date': date,
            # The score is rendered as two <i> elements, e.g. "9." + "6".
            'grade': integer + fraction,
        }
def save_one_page(fo, item):
    """Append one movie record to an open file object as a JSON line.

    ``ensure_ascii=False`` keeps Chinese characters readable in the output
    instead of escaping them to ``\\uXXXX`` sequences; the file must
    therefore be opened with a utf8 encoding.

    :param fo: writable text file object
    :param item: dict describing one movie
    """
    fo.write(json.dumps(item, ensure_ascii=False) + '\n')

# Backward-compatible alias: the function was originally published under
# this misspelled name and existing callers still use it.
save_one_pagr = save_one_page
def main():
    """Crawl all 10 pages of the Maoyan top-100 board into result.txt."""
    # Each page differs only by the offset query parameter: 0, 10, ..., 90.
    start_url = "http://maoyan.com/board/4?offset="
    # Open the file once for the whole run; 'with' guarantees it is closed
    # even if a request or parse step raises.  Append mode, utf8 so the
    # Chinese JSON lines are written correctly.
    with open('result.txt', 'a', encoding='utf8') as fo:
        for page in range(10):  # the board has exactly 10 pages
            url = start_url + str(page * 10)
            html = crawl_one_page(url)
            # Progress indicator.
            print("[+] " + url)
            # parse_one_page is a generator; drain it into the file.
            for item in parse_one_page(html):
                save_one_pagr(fo, item)
# Run the crawler only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
# Sample output line:
# {"index": "1", "image_url": "http://p1.meituan.net/movie/20803f59291c47e1e116c11963ce019e68711.jpg@160w_220h_1e_1c", "title": "霸王别姬", "protagonist": ["张国荣", "张丰毅", "巩俐"], "date": "1993-01-01", "grade": "9.6"}