python爬虫代码-爬取bilibili-live开播房间信息

# -*- encoding:utf-8 -*-
# 作者:大帅
# 目标:获取bilibili直播所有开播房间信息并保存在本地

import requests,bs4,re,time
import xlwings as xw
from selenium import webdriver
from bs4 import BeautifulSoup

# Module-level accumulator: getroomlist() appends one
# [anchor_name, room_title, popularity] row per parsed room to this list
# and returns this same list object.
roomlist = []


def gethtml(url):
    """Render the live-room listing page in Chrome and return its HTML.

    The page is opened with Selenium, one control inside the
    ``room-list-section`` element is clicked once, and then a second
    control is clicked repeatedly (one second apart) until it can no
    longer be located or clicked. The resulting page source is written to
    ``f:\\workspace\\example\\livehtml.txt`` and returned.

    Args:
        url: Address of the page to load.

    Returns:
        The rendered page source as a string, or ``None`` if any step
        failed (the failure is printed, not raised).
    """
    # Local imports: the file only imports ``webdriver`` at module level.
    from selenium.common.exceptions import WebDriverException
    from selenium.webdriver.common.by import By

    driver = None
    try:
        driver = webdriver.Chrome()
        driver.get(url)

        # NOTE(review): presumably switches the listing's view/sort mode
        # before expanding it -- confirm against the live page.
        driver.find_element(
            By.XPATH,
            '//*[@id="room-list-section"]/div[1]/div/div[1]/div[2]',
        ).click()

        # NOTE(review): presumably a "load more" control; keep clicking it
        # until Selenium reports it missing or not clickable.
        while True:
            try:
                button = driver.find_element(
                    By.XPATH,
                    '//*[@id="room-list-section"]/div[2]/div[1]/span[1]',
                )
                time.sleep(1)
                button.click()
            except WebDriverException:
                break

        html = driver.page_source

        # Keep a copy of the rendered page on disk.
        with open(r'f:\workspace\example\livehtml.txt', 'w',
                  encoding='utf-8') as file_html:
            file_html.write(html)

        return html
    except Exception as exc:
        # Best-effort contract: report the failure and return None.
        print('执行异常', exc)
        return None
    finally:
        # Always shut the browser down, including on failure.
        if driver is not None:
            try:
                driver.quit()
            except Exception as exc:
                print('执行异常', exc)

def getroomlist(html):
    """Extract room rows from the listing HTML into the module-level list.

    Each child of the ``<ul>`` inside ``<section id="room-list-section">``
    is converted to text and scanned with regular expressions. The first
    ``title="..."`` value is taken as the room title, the second as the
    anchor (streamer) name, and the first all-digit text found between
    ``>`` and ``<`` as the popularity value.

    Children with fewer than two ``title`` attributes, or without an
    all-digit value, are skipped.

    Args:
        html: Page source to parse. A falsy value yields no new rows.

    Returns:
        The module-level ``roomlist``, with one
        ``[anchor_name, room_title, popularity]`` row appended per parsed
        room. The popularity is kept as a string of digits. Rows
        accumulate across calls because the list is shared.
    """
    if not html:
        return roomlist

    soup = BeautifulSoup(html, "html.parser")

    section = soup.find('section', id='room-list-section')
    room_items = section.ul if section is not None else None
    if room_items is None:
        # The expected container is not in this document; nothing to add.
        return roomlist

    # Compiled once, outside the loop. ``[^"]*`` stops at the closing quote
    # so two title attributes on the same line are matched separately.
    title_pattern = re.compile(r'title="([^"]*)"')
    view_pattern = re.compile(r'>(\d+)<')

    for li in room_items:
        markup = str(li)

        # Room title and anchor name.
        titles = title_pattern.findall(markup)
        if len(titles) < 2:
            continue
        room_title, anchor_name = titles[0], titles[1]

        # Popularity value.
        vc_match = view_pattern.search(markup)
        if vc_match is None:
            continue

        roomlist.append([anchor_name, room_title, vc_match.group(1)])

    return roomlist

    
def savelivelist(livelist):
    """Write the room rows into ``sheet1`` of the live.xlsx workbook.

    Opens ``f:\\workspace\\example\\live.xlsx`` through xlwings, writes a
    header row at ``A1`` and the rows of ``livelist`` starting at ``A2``,
    saves the workbook and prints a completion message.

    Args:
        livelist: List of ``[anchor_name, room_title, popularity]`` rows.

    Raises:
        Any exception raised by xlwings propagates to the caller. The
        workbook is closed and the Excel application is quit before it
        does.
    """
    app = xw.App(visible=True, add_book=False)
    try:
        app.display_alerts = False
        app.screen_updating = False

        wb = app.books.open(r'f:\workspace\example\live.xlsx')
        try:
            sheet1 = wb.sheets['sheet1']
            sheet1.range('A1').value = ['主播名', '房间名', '人气值']
            # Nothing to write below the header when there are no rows.
            if livelist:
                sheet1.range('A2').value = livelist
            wb.save()
        finally:
            wb.close()
    finally:
        # Never leave the Excel process running, even on failure.
        app.quit()

    print ('完成存储')

def main():
    """Fetch the listing page, parse the rooms and store them in Excel.

    Stops after the fetch step if ``gethtml`` returns ``None``, which is
    what it returns when it fails.
    """
    url = 'http://live.bilibili.com/all'

    html = gethtml(url)
    if html is None:
        # Fetching failed; there is nothing to parse or save.
        return

    livelist = getroomlist(html)

    savelivelist(livelist)

# Runs the whole scrape whenever this module is executed or imported
# (there is no ``__main__`` guard).
main()


©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容