猫眼(第九次作业)

爬取猫眼电影榜单

import requests

from lxml import etree

import csv

# Request headers passed to every requests.get() call in this script.
# The only entry is a desktop-browser User-Agent string.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/74.0.3724.8 Safari/537.36'
    ),
}

def _first_or(nodes, default=''):
    """Return the first item of an XPath result list, or *default* if it is empty."""
    return nodes[0] if nodes else default


def _drop_label(text, label):
    """Return *text* without surrounding whitespace and without a leading label.

    Only an exact leading *label* (plus the colon that follows it) is
    removed.  ``str.strip(label)`` must not be used for this: it treats its
    argument as a set of characters and strips them from BOTH ends, which
    can eat the first or last characters of the real value.
    """
    text = text.strip()
    if text.startswith(label):
        # The colon after the label may be ASCII ':' or full-width U+FF1A.
        text = text[len(label):].lstrip(':\uff1a').strip()
    return text


def get_url(url):
    """Download one board page and pass every film on it to ``get_info``.

    Each ``<dd>`` under ``<dl class="board-wrapper">`` is treated as one
    film.  Title, cast line, release line, score and detail-page link are
    read from it and forwarded to ``get_info``.  A field whose node is
    missing is forwarded as an empty string; an entry without a detail-page
    link is skipped.

    Args:
        url: Address of the board page to download.

    Raises:
        requests.RequestException: If the download fails or exceeds the
            10-second timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(res.text)

    for info in html.xpath('//dl[@class="board-wrapper"]/dd'):
        link = _first_or(info.xpath('div/div/div[1]/p[1]/a/@href'), None)
        if link is None:
            # Without a link there is no detail page for get_info to fetch.
            continue

        title = _first_or(info.xpath('div/div/div[1]/p[1]/a/text()'))
        author = _drop_label(
            _first_or(info.xpath('div/div/div[1]/p[2]/text()')), '主演')
        pub_time = _drop_label(
            _first_or(info.xpath('div/div/div[1]/p[3]/text()')), '上映时间')

        # The score is stored as two text fragments that are concatenated.
        # NOTE(review): presumably integer part + fractional part -- confirm
        # against the live page markup.
        star = (_first_or(info.xpath('div/div/div[2]/p/i[1]/text()'))
                + _first_or(info.xpath('div/div/div[2]/p/i[2]/text()')))

        get_info('https://maoyan.com' + link, title, author, pub_time, star)

def get_info(url, title, author, pub_time, star):
    """Download a film's detail page and append one row to the CSV output.

    Two values are read from the detail page: ``style`` (text of the first
    ``<li>`` of the info list) and ``long_time`` (the second ``/``-separated
    segment of the second ``<li>``, with a trailing ``分钟`` removed).  They
    are written together with the values received from the caller.  If
    either node is missing or has an unexpected shape, that column is
    written as an empty string instead of aborting the whole run.

    The row is written through the module-level ``writer``, which must exist
    before this function is called (the script creates it in its
    ``__main__`` section).

    Args:
        url: Address of the film's detail page.
        title: Film title, written unchanged.
        author: Cast text, written unchanged.
        pub_time: Release text, written unchanged.
        star: Score text, written unchanged.

    Raises:
        requests.RequestException: If the download fails or exceeds the
            10-second timeout.
    """
    # A timeout keeps one stalled connection from hanging the whole run.
    res = requests.get(url, headers=headers, timeout=10)
    html = etree.HTML(res.text)

    style_nodes = html.xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[1]/text()')
    style = style_nodes[0] if style_nodes else ''

    long_time = ''
    detail_nodes = html.xpath(
        '/html/body/div[3]/div/div[2]/div[1]/ul/li[2]/text()')
    if detail_nodes:
        segments = detail_nodes[0].split('/')
        if len(segments) > 1:
            long_time = segments[1].strip()
            # Remove the unit as an exact suffix; str.strip('分钟') would
            # strip those characters individually from both ends.
            if long_time.endswith('分钟'):
                long_time = long_time[:-len('分钟')].strip()

    writer.writerow([title, author, pub_time, star, style, long_time])

if __name__ == '__main__':
    # Script entry point: create maoyan.csv, write the header row, then
    # process the ten board pages (offset 0, 10, ..., 90) one by one.
    #
    # The file is opened in a ``with`` block so it is flushed and closed even
    # if a request or parse step raises part-way through.  ``newline=''`` is
    # what the csv module requires for files it writes to.
    with open('maoyan.csv', 'w', newline='', encoding='utf-8') as fp:
        # ``writer`` is deliberately a module-level name: get_info() writes
        # its rows through it.
        writer = csv.writer(fp)
        writer.writerow(
            ['title', 'author', 'pub_time', 'star', 'style', 'long_time'])

        urls = ['https://maoyan.com/board/4?offset={}'.format(i)
                for i in range(0, 100, 10)]
        for url in urls:
            get_url(url)


分析


最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
【社区内容提示】社区部分内容疑似由AI辅助生成,浏览时请结合常识与多方信息审慎甄别。
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

友情链接更多精彩内容