Python 爬取 哔站视频弹幕 并实现词云图可视化

嗨喽,大家好呀~这里是爱看美女的茜茜呐

环境介绍:

  • python 3.8 解释器

  • pycharm 编辑器

第三方模块:

  • requests >>> pip install requests

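  • protobuf >>> pip install protobuf

  • pandas >>> pip install pandas (词云图部分用到)

  • wordcloud >>> pip install wordcloud (词云图部分用到)

  • matplotlib >>> pip install matplotlib (词云图部分用到)

注意: 代码中导入的 dm_pb2 不是通过 pip 安装的模块, 而是用 protoc 根据弹幕的 .proto 定义文件生成的 dm_pb2.py, 需要和脚本放在同一目录下才能导入。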

如何安装python第三方模块:

  1. win + R 输入 cmd 点击确定, 输入安装命令 pip install 模块名 (pip install requests) 回车

  2. 在pycharm中点击Terminal(终端) 输入安装命令

代码展示

import requests
import dm_pb2
from google.protobuf import text_format
import re
from datetime import datetime
import csv


# Single-segment scraper: download one danmaku segment, decode the protobuf
# payload and dump every comment to danmu.csv.

# NOTE(review): this is a hard-coded logged-in session cookie (SESSDATA,
# bili_jct). Replace it with your own and avoid committing real credentials.
headers = {
    'cookie': "buvid3=355AA300-6A61-04E5-A05C-E891D886F69632716infoc; b_nut=1675085932; i-wanna-go-back=-1; _uuid=387EA3810-FBF5-E92C-827E-2510B578C5B9A33232infoc; buvid4=15C69C98-F6A7-EC6A-872F-E69C1840DD6D33724-023013021-1pW1w45e5fZS9RtebDiGZw%3D%3D; nostalgia_conf=-1; rpdid=|(kmJY|k))lY0J'uY~l|)lmY|; b_ut=5; is-2022-channel=1; buvid_fp_plain=undefined; CURRENT_BLACKGAP=0; LIVE_BUVID=AUTO3216755179681630; header_theme_version=CLOSE; CURRENT_PID=17897430-d93d-11ed-a1f4-675e4c96ff79; FEED_LIVE_VERSION=V8; CURRENT_QUALITY=80; fingerprint=58d6d808ef27a6225c943be7ca980284; buvid_fp=58d6d808ef27a6225c943be7ca980284; enable_web_push=DISABLE; CURRENT_FNVAL=4048; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDIzODAyNjYsImlhdCI6MTcwMjEyMTAwNiwicGx0IjotMX0.hHZgEl37y35RHgNUEbXnT3y_rtg_w3d1O46vW5TreIQ; bili_ticket_expires=1702380206; SESSDATA=0f019744%2C1717673066%2Ca41c0%2Ac2CjArLmPZFHNFg3B5H60pjRwiqJSLXDG8l2Pb_74Q11o8NmBWyKegdnFb6ivxUL255pwSVjRoaXFXVmFoRlFXY3VCRTAybEpud2ltaXFkRzZXQ25uZ3h0VGxrdGg3bWcxQ2hJN3d4VEZQRjRRTnd5cUx2TmJfUUdlWVZocVRfb281QnJHSklrTkJ3IIEC; bili_jct=f2a37b8a7351e9987d90f80d72dab593; DedeUserID=422789639; DedeUserID__ckMd5=fc4901c78719b545; b_lsid=125EDCFE_18C4E7B181A; home_feed_column=5; browser_resolution=1920-963; sid=6qcgbo4l; PVID=2",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}
url = 'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=323723441&pid=715024588&segment_index=1&pull_mode=1&ps=0&pe=120000&web_location=1315873&w_rid=8138667fe7c9a9d9aa23f488f69e5c2d&wts=1702124018'

# 1. Send the request. The timeout keeps a stalled connection from hanging the
#    script, and raise_for_status() stops an HTTP error page from being fed to
#    the protobuf decoder.
response = requests.get(url=url, headers=headers, timeout=10)
response.raise_for_status()

# 2. Decode the binary protobuf body.
my_seg = dm_pb2.DmSegMobileReply()
my_seg.ParseFromString(response.content)

# 3. Write the header and all rows through one file handle instead of
#    reopening danmu.csv for every comment.
with open("danmu.csv", mode='w', encoding='utf-8', newline='') as f:
    csv_writer = csv.writer(f)
    csv_writer.writerow(["弹幕所在位置", "弹幕内容", "弹幕发布时间"])
    for elem in my_seg.elems:
        # Read the fields straight from the message. Regex-matching the text
        # dump left the surrounding quotes and escape sequences in the comment
        # text and raised IndexError whenever a field was absent from the dump.
        if not elem.content:
            # Nothing to record (or to feed the word cloud) for an empty comment.
            continue
        # progress is the offset into the video in milliseconds; an unset
        # field reads as 0, so no invented fallback value is needed.
        minutes, seconds = divmod(elem.progress // 1000, 60)
        current_time = f'{minutes:02d}:{seconds:02d}'
        # ctime is a Unix timestamp; it is rendered in the local time zone.
        date_time = datetime.fromtimestamp(elem.ctime).strftime('%Y-%m-%d %H:%M:%S')
        print(current_time, elem.content, date_time)
        csv_writer.writerow([current_time, elem.content, date_time])
from datetime import datetime
import re
import requests
import dm_pb2
from google.protobuf import text_format
import csv


# Start danmu.csv from scratch with only the column headers;
# parse_data() appends the actual rows later.
with open('danmu.csv', mode='w', encoding='utf-8', newline='') as csv_file:
    csv.writer(csv_file).writerow(['弹幕时间', '弹幕出现位置', '弹幕内容'])

def time_str_to_milliseconds(time_str):
    """Convert an ``H:M:S`` time string to a number of milliseconds.

    Raises ValueError if the string does not consist of exactly three
    colon-separated integers.
    """
    hours, minutes, seconds = (int(part) for part in time_str.split(':'))
    total_seconds = hours * 3600 + minutes * 60 + seconds
    return total_seconds * 1000

# start_time = "00:23:58"
# end_time = "00:26:03"
# # 转换为毫秒
# start_ms = time_str_to_milliseconds(start_time)
# end_ms = time_str_to_milliseconds(end_time)


def get_data(url):
    """Download one danmaku segment.

    Args:
        url: Full URL of the segment endpoint to fetch.

    Returns:
        The ``requests.Response``; its ``content`` holds the raw bytes that
        parse_data() decodes.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        requests.Timeout: if the server does not respond within 10 seconds.
    """
    # NOTE(review): this is a hard-coded logged-in session cookie (SESSDATA,
    # bili_jct). Replace it with your own and avoid committing real credentials.
    headers = {
        'Cookie': "buvid3=5CB78B54-F1B3-FCE6-F1AD-C0831287EFD881020infoc; b_nut=1699856581; i-wanna-go-back=-1; b_ut=7; _uuid=F625CC83-C9D9-101035-7C36-D3BDFD6BE10CF80953infoc; enable_web_push=DISABLE; home_feed_column=5; DedeUserID=422789639; DedeUserID__ckMd5=fc4901c78719b545; header_theme_version=CLOSE; CURRENT_FNVAL=4048; buvid4=A6C069B5-4DB6-437A-1160-A2D1E031AFF772289-023083014-j%2BEVJ7V9TtLMVIMXjUkPKw%3D%3D; fingerprint=b3a2765a971ea2692a81ff8b1844fae5; buvid_fp_plain=undefined; buvid_fp=b3a2765a971ea2692a81ff8b1844fae5; rpdid=|(kmJYmkk~k)0J'uYmm)lY~k~; PVID=1; SESSDATA=1a664f71%2C1717565740%2C48bce%2Ac1CjCHJjBfBSiCSW6Dfm5CAL39PzQZEKS9eUW3s5GUBHFuBSQ-KUhgo1bPfAdpSv22A1oSVnhWOUkwbnprSnY4MEVnd1dkNXBFYTVQWk1fYkJkeUZjZmFsRjJSSDB0MndxRmFZRUJTQjRjd0xwMkY2ZWtZal9sTWV6azZZclRTQ0dVNmFzZW14N1FnIIEC; bili_jct=365ff75a8dd1510cb2cdd93895923f7e; sid=4ggq2j9r; bp_video_offset_422789639=872607904249675833; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MDIyNzI5NzcsImlhdCI6MTcwMjAxMzcxNywicGx0IjotMX0.Mn0QVb_HBWG4wdx-IaVgx9UB4CkJW8P5QVS4LDqQGvA; bili_ticket_expires=1702272917; browser_resolution=1562-1010; innersign=0; b_lsid=A5D8EDDF_18C4D46CC84",
        'Referer': "https://www.bilibili.com/bangumi/play/ep327584",
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
    }
    # Without a timeout requests waits indefinitely on a stalled connection.
    resp = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly here rather than handing an error page to the protobuf parser.
    resp.raise_for_status()
    return resp

def _format_position(progress_ms):
    """Format a millisecond offset into the video as ``HH:MM:SS``."""
    minutes, seconds = divmod(progress_ms // 1000, 60)
    hours, minutes = divmod(minutes, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


def parse_data(resp):
    """Decode one danmaku segment, print its comments and append them to danmu.csv.

    Args:
        resp: Response object whose ``content`` attribute holds the serialized
            ``DmSegMobileReply`` bytes.

    Each appended row is ``[post time, position in video, comment text]``,
    matching the header written when danmu.csv is created.
    """
    my_seg = dm_pb2.DmSegMobileReply()
    my_seg.ParseFromString(resp.content)

    # One file handle per segment instead of reopening the file for every row.
    with open('danmu.csv', mode='a', encoding='utf-8', newline='') as f:
        csv_writer = csv.writer(f)
        for elem in my_seg.elems:
            # Fields are read straight from the message. The earlier approach
            # matched one fixed ten-field regex against the text dump, so any
            # element whose dump deviated from that exact layout (a field
            # missing from the dump, a quote inside the comment text) was
            # silently dropped.
            if not elem.content:
                # The old pattern also required non-empty text; keep skipping
                # empty comments so the CSV never gets blank content cells.
                continue

            # progress is in milliseconds. Formatting hours explicitly keeps
            # the "00:mm:ss" output for the first hour and stays correct after
            # it, where the old '00:' + mm:ss produced values like "00:75:30".
            video_position = _format_position(elem.progress)

            # ctime is a Unix timestamp; it is rendered in the local time zone.
            date_time = datetime.fromtimestamp(elem.ctime).strftime('%Y-%m-%d %H:%M:%S')

            print("弹幕出现位置:", video_position)
            print("弹幕时间:", date_time)
            print("弹幕内容:", elem.content)
            print("----------")
            csv_writer.writerow([date_time, video_position, elem.content])


# Segment URLs to scrape, in playback order. Each one carries its own
# w_rid/wts query values, so they are listed verbatim instead of being built
# from a template.
url_list = [
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=1&pull_mode=1&ps=0&pe=120000&web_location=1315873&w_rid=3078e56400ad93df33859b09b8464f6b&wts=1702103538',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=1&pull_mode=1&ps=120000&pe=360000&web_location=1315873&w_rid=db9e8a1b66eacfb77d7e92762ac3fc4b&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=2&web_location=1315873&w_rid=9fe6b7defe3bcd611f6ec7bbd8a57553&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=3&web_location=1315873&w_rid=59a05c03d41c295ad57e0cd23db695eb&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=4&web_location=1315873&w_rid=48a794c85798922aac2ce4a5ad779544&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=5&web_location=1315873&w_rid=62fa8d41489f2b58f2a8577e3e654ef0&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=6&web_location=1315873&w_rid=0d9313ee507d135bce658616e694fb39&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=7&web_location=1315873&w_rid=151cf518a34b72ceeb35fec82b30cd43&wts=1702103541',
    'https://api.bilibili.com/x/v2/dm/wbi/web/seg.so?type=1&oid=197711172&pid=328492664&segment_index=8&web_location=1315873&w_rid=394bda938a8a775152f1ee7641d0d4bb&wts=1702103541',
]

# Fetch each segment and hand the response straight to the parser.
for url in url_list:
    parse_data(get_data(url))

词云图

import pandas as pd
from wordcloud import WordCloud
import matplotlib.pyplot as plt

# Load the scraped comments.
df = pd.read_csv('danmu.csv')
# str.join only accepts strings: drop empty cells (read back as NaN) and
# stringify everything else so that comments pandas parsed as numbers do not
# raise TypeError.
text = " ".join(df['弹幕内容'].dropna().astype(str))

# Build the word cloud. A font with CJK glyphs is passed explicitly so the
# Chinese text can be rendered.
# NOTE(review): the font path is Windows-specific; adjust it on other systems.
wordcloud = WordCloud(width=800, height=800,
                      font_path=r'C:/Windows/Fonts/simhei.ttf',
                      background_color='white',
                      min_font_size=10).generate(text)

# Show the word cloud on a square, axis-free figure.
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad=0)

plt.show()

尾语

感谢你观看我的文章呐~本次航班到这里就结束啦 🛬

希望本篇文章能对你有所帮助 🎉,让你学到一点知识~

躲起来的星星🍥也在努力发光,你也要努力加油(让我们一起努力叭)。

©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 212,185评论 6 493
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 90,445评论 3 385
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 157,684评论 0 348
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 56,564评论 1 284
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 65,681评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 49,874评论 1 290
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,025评论 3 408
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 37,761评论 0 268
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,217评论 1 303
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,545评论 2 327
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,694评论 1 341
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,351评论 4 332
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 39,988评论 3 315
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,778评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,007评论 1 266
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,427评论 2 360
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,580评论 2 349

推荐阅读更多精彩内容