今天的目标是独立完成一个小项目:自己动手爬取猫眼电影 Top100。具体实现如下,仅供参考。
- 猫眼电影top100
import requests
from lxml import etree
def parse():
    """Scrape the Maoyan Top-100 board and print one dict per movie.

    Requests the ten board pages (offset 0, 10, ..., 90), extracts the
    name, cast line and release-time line of every ``<dd>`` entry with
    XPath, and prints the collected ``{'name', 'role', 'time'}`` dicts
    once all pages have been processed.

    Errors raised by ``requests.get`` (including timeouts) are not caught.
    """
    headers = {"User-Agent":"Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36"}
    movie_info_list = []
    for offset in range(0, 91, 10):
        url = "https://maoyan.com/board/4?offset={}".format(offset)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        req = requests.get(url, headers=headers, timeout=10)
        req.encoding = 'utf-8'
        html = etree.HTML(req.text)
        if html is None:
            # etree.HTML may yield None when the body has no parseable
            # markup; skip the page instead of failing on `.xpath`.
            continue
        for movie in html.xpath("//div[@class='main']/dl/dd"):
            # Movie title
            name = movie.xpath("./div/div/div[1]/p[1]/a/text()")
            name = name[0] if name else ''
            # Cast line
            # NOTE(review): the 'role:' / 'time:' prefixes removed below may
            # not match the labels the live page uses - confirm against the HTML.
            role = movie.xpath("./div/div/div[1]/p[2]/text()")
            role = role[0] if role else 'role:'
            role = role.replace('role:', '').strip()
            # Release time (local renamed so it does not shadow the stdlib
            # `time` module name; the dict key is unchanged).
            release_time = movie.xpath("./div/div/div[1]/p[3]/text()")
            release_time = release_time[0] if release_time else 'time:'
            release_time = release_time.replace('time:', '')
            movie_info_list.append({
                'name': name,
                'role': role,
                'time': release_time,
            })
    for movie_info in movie_info_list:
        print(movie_info)
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    parse()
图片和详情链接的爬取在此并没有给出,下面的豆瓣网爬取案例中会一并给出。
- 豆瓣电影top250
import requests
from lxml import etree
import pandas as pd
def _first(node, path, default=''):
    """Return the first result of ``node.xpath(path)``, or *default* if none."""
    found = node.xpath(path)
    return found[0] if found else default


def parse():
    """Scrape Douban Top-250, download the posters and write a CSV summary.

    Walks the ten list pages (start 0, 25, ..., 225) and collects, for each
    ``<li>`` entry, the serial number, title, introduction, star rating,
    number of ratings, one-line quote, detail link and poster URL.
    Afterwards every collected record is printed, its poster is saved under
    ``./imgs/`` (when the download returns HTTP 200), and all records are
    written to ``douban_top250_info.csv``.

    Errors raised by ``requests.get`` (including timeouts) are not caught.
    """
    import os

    headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"}
    # Collected records, one dict per movie: [{}, {}, ...]
    movie_info_list = []
    # Visit every list page.
    for start in range(0, 226, 25):
        url = 'https://movie.douban.com/top250?start={}&filter='.format(start)
        # A timeout keeps one stalled connection from hanging the whole crawl.
        resp = requests.get(url, headers=headers, timeout=10)
        # Feed the raw bytes to lxml and query the tree with XPath.
        html = etree.HTML(resp.content)
        if html is None:
            # etree.HTML may yield None when the body has no parseable
            # markup; skip the page instead of failing on `.xpath`.
            continue
        for movie in html.xpath('//div[@id="content"]//ol/li'):
            # Rank within the list
            serial_number = _first(movie, './div[@class="item"]/div[@class="pic"]/em/text()')
            # Title
            movie_name = _first(movie, './div[@class="item"]/div[@class="info"]/div[@class="hd"]/a/span[1]/text()')
            # Introduction, with surrounding whitespace removed
            introduce = _first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[1]/text()').strip()
            # Star rating
            star = _first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[2]/text()')
            # Number of ratings: drop the trailing label, keep the count
            evalute = _first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/div[@class="star"]/span[4]/text()', '人评价')
            evalute = evalute.replace('人评价', '')
            # One-line quote
            describe = _first(movie, './div[@class="item"]/div[@class="info"]/div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()')
            # Detail page link
            detail_link = _first(movie, './div[@class="item"]/div[@class="pic"]/a/@href')
            # Poster image URL
            img_url = _first(movie, './div[@class="item"]/div[@class="pic"]/a/img/@src')
            movie_info_list.append({
                'serial_number': serial_number,
                'movie_name': movie_name,
                'introduce': introduce,
                'star': star,
                'evalute': evalute,
                'describe': describe,
                'detail_link': detail_link,
                'img_url': img_url,
            })

    # open() below does not create missing directories, so make sure the
    # target folder exists before the first poster is written.
    os.makedirs('./imgs', exist_ok=True)
    for movie_info in movie_info_list:
        print(movie_info)
        if not movie_info['img_url']:
            # No poster URL was found; requests.get('') would raise.
            continue
        resp = requests.get(movie_info['img_url'], timeout=10)
        if resp.status_code == 200:
            # Fixed-width, zero-padded name (e.g. 0000001.jpg, 0000010.jpg)
            # so every file follows the same 7-digit pattern.
            img_name = '{}.jpg'.format(movie_info['serial_number'].zfill(7))
            with open('./imgs/{}'.format(img_name), 'wb') as f:
                f.write(resp.content)
    # Persist all records as CSV.
    df = pd.DataFrame(movie_info_list)
    df.to_csv('douban_top250_info.csv')
# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    parse()
csv文件是用逗号分隔开的一种文件。具体解释如下:
逗号分隔值(Comma-Separated Values,CSV,有时也称为字符分隔值,因为分隔字符也可以不是逗号),其文件以纯文本形式存储表格数据(数字和文本)。纯文本意味着该文件是一个字符序列,不含必须像二进制数字那样被解读的数据。CSV文件由任意数目的记录组成,记录间以某种换行符分隔;每条记录由字段组成,字段间的分隔符是其它字符或字符串,最常见的是逗号或制表符。通常,所有记录都有完全相同的字段序列。通常都是纯文本文件。建议使用WORDPAD或是记事本来开启,再则先另存新档后用EXCEL开启,也是方法之一。
CSV文件格式的通用标准并不存在,但是在RFC 4180中有基础性的描述。使用的字符编码同样没有被指定,但是7-bit ASCII是最基本的通用编码。