最基本的爬虫项目。

import requests
import re
import json

获取二进制流

def get_image(url):
    """Download the resource at *url* and return its raw bytes, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }

    response = requests.get(url, headers=headers)
    # Anything other than HTTP 200 is treated as a failed download.
    if response.status_code != 200:
        return None
    return response.content

获取网页

def get_page(page):
    """Fetch one Maoyan Top-100 board page (10 movies per page).

    *page* is zero-based; it is turned into the board's offset parameter.
    Returns the decoded HTML text, or None when the request fails.
    """
    url = 'https://maoyan.com/board/4?offset=%d' % (page * 10)
    print(url)

    headers = {
        "User-Agent": "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)"
    }

    response = requests.get(url, headers=headers)
    if response.status_code != 200:
        return None
    return response.content.decode('utf-8')

解析网页

def parse_page(html):
    """Extract movie records from one board page's HTML.

    Returns a list of dicts with keys: title, actor, releasetime, score,
    rank, cover.  As a side effect, downloads each cover image via
    save_image().

    The six regexes scrape parallel lists; zip() pairs them up and stops at
    the shortest, so a partially matched page cannot raise IndexError the
    way indexing with range(len(titles)) could.
    """
    # Title: alt attribute of the second <img> inside the movie link.
    pattern = re.compile(r'movieId:.*?>.*?<img.*?<img.*?alt="(.*?)" class.*?', re.S)
    titles = re.findall(pattern, html)

    # Lead actors (whitespace-padded in the markup, hence strip()).
    pattern = re.compile(r'<p class="star">(.*?)</p>', re.S)
    actors = [actor.strip() for actor in re.findall(pattern, html)]

    # Release date.
    pattern = re.compile(r'<p class="releasetime">(.*?)</p>', re.S)
    releasetimes = [rt.strip() for rt in re.findall(pattern, html)]

    # Score is split into integer and fraction parts; join them back ("9" + ".5").
    pattern = re.compile(r'<p class="score"><i class="integer">(.*?)</i><i class="fraction">(.*?)</i></p>', re.S)
    scores = [''.join(score) for score in re.findall(pattern, html)]

    # Board rank (1-100).
    pattern = re.compile(r'<i class="board-index board-index-.*?">(.*?)</i>', re.S)
    ranks = re.findall(pattern, html)

    # Cover image URL: src of the second <img> inside the movie link.
    pattern = re.compile(r'movieId:.*?>.*?<img.*?<img.*?src="(.*?)" alt=.*?', re.S)
    covers = re.findall(pattern, html)

    result = []
    for title, actor, releasetime, score, rank, cover in zip(
            titles, actors, releasetimes, scores, ranks, covers):
        data = {
            'title': title,
            'actor': actor,
            'releasetime': releasetime,
            'score': score,
            'rank': rank,
            'cover': cover,
        }

        save_image(data['cover'])

        result.append(data)

    return result

保存数据

def save_json(result):
    """Write *result* to maoyan3.json as UTF-8 JSON, keeping non-ASCII readable."""
    with open('maoyan3.json', 'w', encoding='utf-8') as f:
        f.write(json.dumps(result, ensure_ascii=False))

保存图片

def save_image(url):
    """Download the cover image at *url* into ./images/.

    The filename is the last path segment with any '@...'-style resize
    suffix stripped off.  If the download fails (get_image returns None)
    the function returns without writing anything — the original would
    have crashed with TypeError on f.write(None).
    """
    import os

    image_content = get_image(url)
    if image_content is None:
        # Download failed; skip this image instead of crashing.
        return
    filename = url.split('/')[-1].split('@')[0]
    # Create the target directory on first use; open() would otherwise
    # raise FileNotFoundError when ./images/ does not exist yet.
    os.makedirs('./images', exist_ok=True)
    filepath = './images/%s' % filename
    with open(filepath, 'wb') as f:
        f.write(image_content)

def main():
    """Crawl the first 10 board pages, print all records, and save them as JSON."""
    all_result = []
    for page in range(10):
        print('page: %d' % (page + 1))
        html = get_page(page)
        if html is None:
            # Request failed; skip this page instead of passing None to
            # parse_page (which would raise TypeError inside re.findall).
            continue
        one_page_result = parse_page(html)
        all_result.extend(one_page_result)

    print(all_result)
    save_json(all_result)

# Script entry point.  The original read `if name == 'main':` — the dunder
# underscores were stripped by the blog's markdown, which would raise
# NameError at runtime.
if __name__ == '__main__':
    main()

获得的数据是 JSON 数据,通过 http://www.bejson.com/jsoneditoronline/ 这个网站可以将其转换成一般数据。

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容