最近有部叫《西虹市首富》的电影特别火,据说一周票房就破10亿了。那么,这部电影到底有哪些人在看,有哪些评论,我们通过大数据来分析一下。
Scrapy的使用就不多说了,猫眼电影的评论也没有反爬措施,你们自己去体会一下。通过APP端接口直接爬取评论,不过只能看到前15000条评论,但对我们来说也足够了。
下面是爬虫部分的代码:
# -*- coding: utf-8 -*-
import scrapy
import json
from movie.items import MovieItem
class MovieSpider(scrapy.Spider):
    """Spider that reads comments for movie 1212592 from the Maoyan mobile JSON endpoint.

    Each response is parsed as JSON and every entry of its ``cmts`` list is
    turned into a ``MovieItem`` with the fields ``user``, ``city``,
    ``comment`` and ``stars`` (all stored as stripped strings).
    """

    name = 'rich'
    allowed_domains = ['maoyan.com']
    # One request per offset value 1..1000. Built with a comprehension so the
    # loop variable does not leak into the class namespace as an attribute.
    # NOTE(review): the offset advances by 1 per request. If the endpoint
    # returns several comments per request starting at that offset,
    # consecutive responses overlap and produce duplicate comments -- confirm
    # against the API and step by the page size if that is the case.
    start_urls = [
        "http://m.maoyan.com/mmdb/comments/movie/1212592.json?_v_=yes&offset=" + str(i)
        for i in range(1, 1001)
    ]

    def parse(self, response):
        """Yield one ``MovieItem`` per comment found in ``response``.

        A response without a ``cmts`` list yields nothing, and a comment that
        lacks one of the expected fields gets an empty string for that field
        instead of aborting the remaining comments of the page with a
        ``KeyError``.
        """
        payload = json.loads(response.body)
        for each in payload.get('cmts') or []:
            item = MovieItem()
            item['user'] = self._clean(each.get('nickName', ''))
            item['city'] = self._clean(each.get('cityName', ''))
            item['comment'] = self._clean(each.get('content', ''))
            item['stars'] = self._clean(each.get('score', ''))
            yield item

    @staticmethod
    def _clean(value):
        """Return ``value`` converted to ``str`` with surrounding whitespace removed."""
        return str(value).strip()
爬完之后,我们到数据库中看到的数据如下图:
这里明眼人一看就知道了,有很多重复的评论,很明显是有人刷出来的,至于是谁刷的,就不关咱们的事了。
数据库去重的方法很多,不过我们还可以借助Excel来去重,也非常方便。
重点来了,去重后的评论只有800多条!呵呵了
Anyway,我们也得到了一些评论...
就着这些评论,我们来分析一下。
首先用jieba和wordcloud这个库做了个词云,效果如下:
Python代码如下。有些地方要注意的:
- wordcloud要正确显示中文,需要指定中文字体(jieba只负责分词,本身不需要字体),我这里下载了msyh.ttf(微软雅黑)的字体放在本地
- "shen.webp" 是原图,用来做Mask的,从豆瓣上下载的
- 以下程序支持txt、数据库、Excel读取评论这三种方式
#-*-coding:utf-8-*-
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import numpy as np
import jieba
import pymysql
from openpyxl import Workbook
from openpyxl import load_workbook
from PIL import Image
from pyecharts import Bar, Pie
class CommentAnalysis():
    """Build a word cloud and a score bar chart from collected movie comments.

    Comments can be loaded from a text file, a MySQL table or an Excel sheet.
    ``ProcessData`` loads the comments and generates the word cloud,
    ``Render`` writes the score bar chart and ``Show`` displays the mask
    image next to the word cloud.
    """

    # Score buckets shown on the bar chart, in display order.
    SCORE_LABELS = ("0.5", "1", "1.5", "2", "2.5", "3", "3.5", "4", "4.5", "5")

    def __init__(self, *args, **kwargs):
        """Set up word-cloud options, result containers and database settings.

        ``*args`` and ``**kwargs`` are accepted but not used.
        """
        # Word-cloud configuration
        self.width = 900
        self.height = 1200
        self.max_words = 300
        self.font_path = "msyh.ttf"
        self.max_font_size = 200
        self.random_state = 30
        # Read the mask fully inside the context manager so the image file
        # handle is released immediately.
        with Image.open("shen.webp") as mask_image:
            self.mask = np.array(mask_image)
        self.stars = []
        self.citys = []
        self.text = ""
        # Word-cloud generator
        self.wc = WordCloud(
            background_color="white",           # background colour
            width=self.width,
            height=self.height,
            mask=self.mask,                     # shape image used as mask
            max_words=self.max_words,           # maximum number of words drawn
            stopwords=self.SetStopWords(),      # words excluded from the cloud
            font_path=self.font_path,           # font able to render Chinese
            max_font_size=self.max_font_size,   # largest font size
            random_state=self.random_state,     # seed for reproducible layout/colours
        )
        # Database connection settings
        # NOTE(review): credentials are hard-coded here; consider loading
        # them from configuration or the environment.
        self.dbName = "movie"
        self.dbTable = "rich"
        self.dbHost = "localhost"
        self.dbPort = "3306"
        self.dbUser = "root"
        self.dbPwd = "3.1415926"
        # Last connection error recorded by ConnectDB (None when none occurred).
        self.dbError = None

    def _Segment(self, text):
        """Return ``text`` segmented by jieba, tokens joined with single spaces."""
        return " ".join(jieba.cut(text, cut_all=False, HMM=False))

    def LoadCmtFromText(self, filename):
        """Read ``filename`` and return its content as space-separated jieba tokens.

        The file is read in binary mode and the raw bytes are handed to
        jieba unchanged.
        """
        with open(filename, "rb") as handle:
            raw = handle.read()
        return self._Segment(raw)

    def ConnectDB(self):
        """Open the MySQL connection and cursor described by the ``db*`` attributes.

        Returns:
            True on success (``self.db`` and ``self.cursor`` are set), False
            on failure; the failure is kept in ``self.dbError`` so callers
            can report it.
        """
        try:
            self.db = pymysql.connect(host=self.dbHost,
                                      port=int(self.dbPort),
                                      user=self.dbUser,
                                      password=self.dbPwd,
                                      db=self.dbName,
                                      use_unicode=True,
                                      charset="utf8")
            self.cursor = self.db.cursor()
        except Exception as exc:
            # Best-effort connect: report failure through the return value,
            # but remember the cause instead of discarding it.
            self.dbError = exc
            return False
        self.dbError = None
        return True

    def LoadCmtFromDB(self):
        """Return all comments stored in ``self.dbTable`` as space-separated jieba tokens.

        The comment is taken from the fifth column (index 4) of every row;
        rows whose comment is NULL are skipped. The connection is closed once
        the rows have been fetched.

        Raises:
            ConnectionError: if the database connection cannot be opened.
        """
        if not self.ConnectDB():
            raise ConnectionError(
                "could not connect to database {!r}".format(self.dbName)
            ) from getattr(self, "dbError", None)
        try:
            # Table names cannot be bound as query parameters; dbTable is
            # internal configuration, not user input.
            self.cursor.execute("SELECT * FROM `{}`".format(self.dbTable))
            results = self.cursor.fetchall()
        finally:
            self.cursor.close()
            self.db.close()
        cmt = "".join(row[4] + "\n" for row in results if row[4] is not None)
        return self._Segment(cmt)

    def LoadCmtFromExcel(self, filename, sheetname):
        """Load comments, scores and cities from sheet ``sheetname`` of ``filename``.

        Column 5 holds the comment, column 4 the score and column 3 the city.
        Every used row of the sheet is read. Non-empty scores and cities are
        appended to ``self.stars`` and ``self.citys``; the segmented comment
        text is stored in ``self.text``.

        Returns:
            The segmented comment text (same value as ``self.text``).
        """
        wb = load_workbook(filename)
        ws = wb[sheetname]
        cmt_column = 5     # column holding the comment
        stars_column = 4   # column holding the score
        city_column = 3    # column holding the city
        start_rows = 1     # first row containing data
        comments = []
        # Iterate over the rows actually present rather than a fixed count.
        for row in range(start_rows, ws.max_row + 1):
            cmt_data = ws.cell(row=row, column=cmt_column).value
            stars_data = ws.cell(row=row, column=stars_column).value
            city_data = ws.cell(row=row, column=city_column).value
            if cmt_data:
                # A purely numeric comment is stored by Excel as a number.
                comments.append(str(cmt_data) + "\n")
            if stars_data:
                self.stars.append(stars_data)
            if city_data:
                self.citys.append(city_data)
        self.text = self._Segment("".join(comments))
        return self.text

    def SetStopWords(self):
        """Return the default wordcloud stop words plus "电影"."""
        stopwords = STOPWORDS.copy()
        stopwords.add("电影")
        return stopwords

    def ScoresCount(self, data):
        """Return a dict mapping each distinct value in ``data`` to its number of occurrences."""
        counts = {}
        for key in data:
            counts[key] = counts.get(key, 0) + 1
        return counts

    @staticmethod
    def _NormalizeScore(value):
        """Return ``value`` as a bucket label such as "4.5" or "5", or None if it is not numeric."""
        try:
            return format(float(value), "g")
        except (TypeError, ValueError):
            return None

    def _StarCounts(self):
        """Return the number of collected scores per entry of ``SCORE_LABELS``.

        Scores may be strings or numbers; values that are not numeric or do
        not match a bucket are ignored, and a bucket without scores counts 0.
        """
        labels = [self._NormalizeScore(star) for star in self.stars]
        stars_dict = self.ScoresCount(label for label in labels if label is not None)
        return [stars_dict.get(label, 0) for label in self.SCORE_LABELS]

    def Render(self):
        """Render a bar chart of the score distribution with pyecharts.

        A score bucket with no votes is drawn as 0 instead of raising
        ``KeyError``.
        """
        attr = [label + "分" for label in self.SCORE_LABELS]
        data = self._StarCounts()
        bar = Bar()
        bar.add("《西虹市首富》评分", attr, data)
        bar.render()
        # The same attr/data pair could also feed a pyecharts Pie (ring) chart.

    def ProcessData(self):
        """Load the comments from "rich.xlsx" (sheet "rich") and generate the word cloud."""
        # Alternative sources, both returning the segmented text:
        #   self.text = self.LoadCmtFromText("rich.txt")
        #   self.text = self.LoadCmtFromDB()
        self.LoadCmtFromExcel("rich.xlsx", "rich")
        self.myword = self.wc.generate(self.text)  # build the word cloud

    def Show(self):
        """Display the mask image and the generated word cloud side by side."""
        # Left: the mask image
        plt.subplot(1, 2, 1)
        plt.imshow(self.mask)
        plt.axis("off")
        # Right: the word cloud
        plt.subplot(1, 2, 2)
        plt.imshow(self.myword)
        plt.axis("off")
        plt.show()
if __name__ == "__main__":
    # Script entry point: load the comments and build the word cloud, write
    # the score bar chart, then display the mask and the word cloud.
    analysis = CommentAnalysis()
    analysis.ProcessData()
    analysis.Render()
    analysis.Show()
再把评分画个图如下(代码中的Render函数):