import json
import os
import re

import requests
# Fetch binary content
def get_image(url):
    """Download the resource at ``url`` and return its raw bytes.

    Args:
        url: Address of the resource to fetch.

    Returns:
        The response body as bytes when the status code is 200,
        otherwise None.
    """
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content
    return None
# Fetch a web page
def get_page(page):
    """Fetch one page of the Maoyan board and return its HTML text.

    Args:
        page: Zero-based page index; it is multiplied by 10 to build the
            ``offset`` query parameter.

    Returns:
        The response body decoded as UTF-8 when the status code is 200,
        otherwise None.
    """
    url = 'https://maoyan.com/board/4?offset=%d' % (page * 10)
    print(url)
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }
    # Without a timeout requests waits indefinitely on an unresponsive server.
    response = requests.get(url, headers=headers, timeout=10)
    if response.status_code == 200:
        return response.content.decode('utf-8')
    return None
# Parse the web page
def parse_page(html):
    """Extract the movie entries from one board page and save each cover.

    Every field is collected with its own regular expression over the whole
    page, and the resulting lists are combined position by position.
    ``save_image`` is called for each entry's cover URL.

    Args:
        html: Page markup as a string, or None when no page is available.

    Returns:
        A list of dicts with the keys 'title', 'actor', 'releasetime',
        'score', 'rank' and 'cover'. Empty when ``html`` is None/empty or
        nothing matches.
    """
    # get_page returns None on a non-200 response; re.findall would raise
    # TypeError on that, so treat "no page" as "no entries".
    if not html:
        return []

    # Title
    titles = re.findall(
        'movieId:.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', html, re.S)

    # Starring actors
    actors = [
        actor.strip()
        for actor in re.findall('<p class="star">(.*?)</p>', html, re.S)
    ]

    # Release time
    releasetimes = [
        releasetime.strip()
        for releasetime in re.findall(
            '<p class="releasetime">(.*?)</p>', html, re.S)
    ]

    # Score: the integer and fraction parts are captured separately and
    # glued back together.
    scores = [
        ''.join(score)
        for score in re.findall(
            '<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>',
            html, re.S)
    ]

    # Rank
    ranks = re.findall(
        '<i class="board-index board-index-.*?">(.*?)</i>', html, re.S)

    # Cover image link
    covers = re.findall(
        'movieId:.*?>.*?<img.*?<img.*?src="(.*?)" alt=.*?', html, re.S)

    result = []
    # zip stops at the shortest list, so a page where one pattern matched
    # fewer times yields fewer entries instead of raising IndexError.
    for title, actor, releasetime, score, rank, cover in zip(
            titles, actors, releasetimes, scores, ranks, covers):
        data = {
            'title': title,
            'actor': actor,
            'releasetime': releasetime,
            'score': score,
            'rank': rank,
            'cover': cover,
        }
        save_image(data['cover'])
        result.append(data)
    return result
# Save the data
def save_json(result):
    """Write ``result`` as JSON to 'maoyan3.json' in the working directory.

    Args:
        result: A JSON-serialisable object.

    The file is overwritten on every call. Non-ASCII characters are written
    as-is (UTF-8) rather than as \\u escapes.
    """
    with open('maoyan3.json', 'w', encoding='utf-8') as f:
        json.dump(result, f, ensure_ascii=False)
# Save an image
def save_image(url):
    """Download the image at ``url`` and store it under ./images.

    The file name is the last path segment of ``url`` with everything from
    the first '@' onwards removed.

    Args:
        url: Address of the image to download.

    Nothing is written when the download yields no content.
    """
    image_content = get_image(url)
    # get_image returns None on a non-200 response; writing None to a binary
    # file would raise TypeError.
    if image_content is None:
        return
    filename = url.split('/')[-1].split('@')[0]
    # open() does not create missing directories.
    os.makedirs('./images', exist_ok=True)
    filepath = './images/%s' % filename
    with open(filepath, 'wb') as f:
        f.write(image_content)
def main():
    """Scrape ten board pages, print the combined entries and save them as JSON."""
    all_result = []
    for page in range(10):
        print('page: %d' % (page + 1))
        html = get_page(page)
        # get_page returns None on a non-200 response; skip that page rather
        # than passing None to the parser.
        if html is None:
            print('page %d could not be fetched, skipping' % (page + 1))
            continue
        one_page_result = parse_page(html)
        all_result.extend(one_page_result)
    print(all_result)
    save_json(all_result)
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()