【Python爬虫】B站《风灵玉秀》动画短评数据分析

Python数据分析结业作业,还有很多待优化的细节,requests还是不太适合大型爬虫,回头慢慢修……

短评爬取

import requests
import json
from fake_useragent import UserAgent
import time
import datetime
import pandas as pd

#Spoof a random browser User-Agent so the API treats us as a browser
headers = {'User-Agent': UserAgent(verify_ssl=False).random}
comment_api = 'https://api.bilibili.com/pgc/review/short/list?media_id=6038&ps=20&sort=0' 

#Fetch the first page of short comments
response = requests.get(comment_api, headers=headers)
data_json = response.text
data = json.loads(data_json)

#Cap at ~1000 comments; crawling everything risks getting banned
#alldata_num=data['data']['total']
alldata_num = 1001

#Pre-allocate the result table, one row per comment
cols = ['author', 'content', 'ctime', 'disliked', 'likes', 'score', 'progress']
df = pd.DataFrame(index=range(alldata_num), columns=cols)

#Page through the short-comment API until alldata_num rows are collected
j = 0
while j < alldata_num:
    data_list = data['data']['list']
    for item in data_list:
        if j >= alldata_num:
            break  #don't write past the planned number of rows
        df.loc[j, 'author'] = item['author']['uname']
        df.loc[j, 'content'] = item['content']
        df.loc[j, 'ctime'] = item['ctime']
        df.loc[j, 'disliked'] = item['stat']['disliked']
        df.loc[j, 'likes'] = item['stat']['likes']
        df.loc[j, 'score'] = item['score']
        try:
            #some comments carry no watch-progress field
            df.loc[j, 'progress'] = item['progress']
        except KeyError:
            pass
        j += 1

    #Advance the cursor and fetch the NEXT page exactly once per page.
    #(The original fetched inside the per-comment loop, issuing ~20
    #requests per page while discarding the responses and skipping the
    #pages in between; it also repeated the fetch again after the loop.)
    next_cursor = data['data']['next']
    url = comment_api + '&cursor=' + str(next_cursor)
    response = requests.get(url=url, headers=headers)
    data_json = response.text
    data = json.loads(data_json)

    #Progress report + throttle so the anti-crawler limits don't trip
    if j % 100 == 0:
        print('————Have finished {}%————'.format(round(j/alldata_num*100,1)))
    time.sleep(0.6)

#Persist so the analysis stage can re-run without crawling again
df.to_csv('shortcomments.csv',index=False)

相关性数据分析

#Load the crawled comments back from disk
df = pd.read_csv('shortcomments.csv')

#Replace missing values with 0 (fillna substitutes, it does not drop rows)
data_all = df.fillna(0)
#转换时间格式
def getDate(x):
    """Convert a Unix timestamp to a pandas Timestamp at UTC midnight."""
    t = time.gmtime(x)
    return pd.Timestamp(datetime.datetime(t.tm_year, t.tm_mon, t.tm_mday))
#Calendar date (UTC midnight) each comment was posted on
data_all['date'] = data_all['ctime'].apply(getDate)

#Stacked-area chart of how many comments were posted per day
from collections import Counter
import matplotlib.pyplot as plt
day_counts = Counter(data_all['date'])
comm_days = pd.DataFrame(day_counts.items(), columns=['date', 'comment_num'])
comment_days = comm_days.sort_values(by='date')
comment_days.plot.area(x='date', y='comment_num', cmap='tab10_r')
评论日期分布
#Days elapsed since a comment was posted
def Days(x):
    """Return the whole number of days between now and Unix timestamp x.

    Both sides of the subtraction are timezone-aware UTC datetimes.
    (The original subtracted a UTC ``gmtime`` struct from the naive
    local-time ``datetime.now()``, skewing the count by the local UTC
    offset — e.g. up to a day for UTC+8.)
    """
    then = datetime.datetime.fromtimestamp(x, tz=datetime.timezone.utc)
    now = datetime.datetime.now(datetime.timezone.utc)
    return (now - then).days
    
#Age of each comment in days
data_all['days'] = data_all['ctime'].apply(Days)
#Length of each comment in characters
data_all['comm_len'] = data_all['content'].apply(len)

#Correlation analysis

#Scatter plot: rating vs. days since the comment was posted
plt.scatter(data_all['score'], data_all['days'], color='blue')

评分&评论距今时间关系
#Multiple regression: likes vs. comment length, rating and comment age

import numpy as np
import statsmodels.api as sm

#Regressors and target as DataFrames
X = pd.DataFrame(data_all.loc[:, ['comm_len', 'score', 'days']],
                 columns=['comm_len', 'score', 'days'])
y = pd.DataFrame(data_all['likes'], columns=['likes'])

#OLS needs an explicit intercept column
X_add1 = sm.add_constant(X)
model = sm.OLS(y, X_add1).fit()
print(model.summary())
相关性拟合
#Only 'days' is significant — drop rating and comment length, then refit
X.drop('score', axis=1, inplace=True)
X.drop('comm_len', axis=1, inplace=True)

X_with_const = sm.add_constant(X)
model = sm.OLS(y, X_with_const).fit()
print(model.summary())

print(model.params)  #regression coefficients
#Fit is weakly significant with a small R²: posting date correlates with
#likes, but the model explains little of the variance.

#Predict likes for a comment posted 6 days ago.
#The leading 1 feeds the intercept (const) column added by add_constant.
X_test = np.array([1,6]) 

#Run the fitted model on the sample
print(model.predict(X_test))
#The predicted like count is negative, which is clearly off; richer
#features (e.g. the author's follower count) would be needed.

重新拟合

绘制评论中高频词词云

#Word cloud of the hottest terms in the comment text
import jieba
import jieba.posseg as pseg
from wordcloud import WordCloud
from PIL import Image        

#Segment text and filter stop words
#Stop words come from the Baidu stop-word list, one per line.
#Use a context manager so the file handle is closed (the original
#left the file open).
with open('stopwords_baidu.txt') as f:
    stop_words = set(line.strip() for line in f)

#Register multi-character proper nouns once so jieba keeps them whole
#(the original re-registered them on every row of the loop).
jieba.suggest_freq(('风灵玉秀'), True)
jieba.suggest_freq(('铃儿'), True)
jieba.suggest_freq(('钰袖'), True)

#Tokenize every comment, dropping stop words and stray tabs.
#NOTE: the original did ' '.join(word), which space-separates the
#*characters* of each word and fuses adjacent words; here whole words
#are collected and joined with single spaces.
words = []
for text in data_all['content']:
    for word in jieba.cut(text):
        if word not in stop_words and word != '\t':
            words.append(word)
outstr = ' '.join(words)

#Count word frequencies, sorted by count descending.
#Split on whitespace so whole words are counted — Counter(outstr) would
#iterate the string character by character. most_common() returns
#(word, count) pairs ordered by count, replacing the manual
#sort-and-rebuild loop (dicts preserve insertion order).
content2 = dict(Counter(outstr.split()).most_common())

#Render the top words inside a heart-shaped mask
mask_image = np.array(Image.open('heart.png'))
cloud = WordCloud(font_path='simhei.ttf', background_color='white',
                  mask=mask_image, colormap='Oranges',
                  width=900, height=600, max_words=50)
wordcloud = cloud.generate_from_frequencies(content2)

plt.imshow(wordcloud)
plt.axis("off")
plt.show()

词云

看起来都是好评~百合、有爱和能打的主题也完美体现了233
完结撒花✿✿ヽ(°▽°)ノ✿

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容