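# Scrape the Maoyan movie board ranking pages with urllib, use re regular
# expressions to extract each film's rank, title, starring actors, release
# time and score, and save the results to a txt file.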
import re
import urllib
import urllib.request
def load_data(page=1):
    """Download one board page from maoyan.com and return its HTML text.

    Args:
        page: 1-based page number. It is converted to the ``offset`` query
            parameter as ``(page - 1) * 10``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the request fails (propagated from
            ``urlopen``).
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    offset = (page - 1) * 10
    url = "https://maoyan.com/board/4?offset=" + str(offset)
    # Use the response as a context manager so the underlying connection is
    # closed deterministically instead of lingering until garbage collection.
    with urllib.request.urlopen(url) as resp:
        return resp.read().decode("utf-8")
def load_ranking(html):
    """Collect the rank text of every ``<dd>`` entry in a board page.

    For each ``<dd`` the text of the first following ``<i ...>...</i>``
    element is captured. ``re.S`` lets the match span line breaks.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of the captured strings in document order (empty when nothing
        matches).
    """
    rank_pattern = re.compile('<dd.*?<i.*?>(.*?)</i>', re.S)
    return rank_pattern.findall(html)
def load_name(html):
    """Extract the film name of every ``<dd>`` entry in a board page.

    For each ``<dd`` the text of the first ``<a>`` directly inside the
    following ``<p class="name">`` element is captured.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of the captured name strings in document order.
    """
    # Raw string: ``\s`` must reach the regex engine untouched; in a normal
    # string literal it is an invalid escape sequence and triggers a warning.
    pattern = r'<dd.*?<p\sclass="name".*?><a.*?>(.*?)</a>'
    return re.findall(pattern, html, re.S)
def load_info(html):
    """Extract the ``<p class="star">`` text of every ``<dd>`` entry.

    The paragraph is looked up inside the ``<div class="board-item-main">``
    that follows each ``<dd``; surrounding whitespace is stripped from the
    captured text.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of stripped strings in document order.
    """
    # Raw string: ``\s`` is a regex class, not a Python string escape.
    pattern = (
        r'<dd.*?<div\sclass="board-item-main".*?'
        r'<p\sclass="star".*?>(.*?)</p>'
    )
    return [text.strip() for text in re.findall(pattern, html, re.S)]
def load_time(html):
    """Extract the ``<p class="releasetime">`` text of every ``<dd>`` entry.

    The paragraph is looked up inside the ``<div class="board-item-main">``
    that follows each ``<dd``; surrounding whitespace is stripped from the
    captured text.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of stripped strings in document order.
    """
    # Raw string: ``\s`` is a regex class, not a Python string escape.
    pattern = (
        r'<dd.*?<div\sclass="board-item-main".*?'
        r'<p\sclass="releasetime".*?>(.*?)</p>'
    )
    return [text.strip() for text in re.findall(pattern, html, re.S)]
def load_score(html):
    """Extract the score of every ``<dd>`` entry in a board page.

    The score is split in the markup across ``<i class="integer">`` and
    ``<i class="fraction">`` elements that follow the entry's
    ``<p class="releasetime">``; the two captured texts are concatenated.

    Args:
        html: HTML text of a board page.

    Returns:
        A list of score strings in document order. The integer and fraction
        parts are paired positionally, so the result is as long as the
        shorter of the two match lists.
    """
    # Raw strings: ``\s`` is a regex class, not a Python string escape.
    integer_pattern = (
        r'<dd.*?<p\sclass="releasetime".*?'
        r'<i\sclass="integer".*?>(.*?)</i>'
    )
    fraction_pattern = (
        r'<dd.*?<p\sclass="releasetime".*?'
        r'<i\sclass="fraction".*?>(.*?)</i>'
    )
    integers = re.findall(integer_pattern, html, re.S)
    fractions = re.findall(fraction_pattern, html, re.S)
    return [whole + frac for whole, frac in zip(integers, fractions)]
def save_file(html, path="top.txt"):
    """Parse one board page and append its records to a text file.

    The rank, name, star info, release time and score lists are zipped
    together; each resulting record is printed to stdout and appended to the
    file as one tab-separated line.

    Args:
        html: HTML text of a board page.
        path: File to append to. Defaults to ``"top.txt"``.
    """
    records = zip(
        load_ranking(html),
        load_name(html),
        load_info(html),
        load_time(html),
        load_score(html),
    )
    # Write UTF-8 explicitly: the page text may contain non-ASCII characters
    # that the platform's default encoding cannot represent.
    with open(path, "a", encoding="utf-8") as f:
        for record in records:
            data = "\t".join(record)
            print(data)
            # One record per line.
            f.write(data + "\n")
# Fetch board pages 1 through 10 in order; each page is parsed and its
# records appended to the output file before the next page is requested.
for page in range(1, 11):
    save_file(load_data(page))