Week 2 assignment: scrape usernames, post content, laugh counts, and like counts from Qiushibaike

import requests
from lxml import etree
import xlwt

# Send a browser-like User-Agent so the site doesn't reject the request
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

all_info_list = []  # one [user, content, laughs, likes] row per scraped post

def get_info(url):
    res = requests.get(url, headers=header)
    html = etree.HTML(res.text)
    infos = html.xpath('//div[@class="col1"]/div')  # one div per post on the listing page
    for info in infos:
        try:
            user_id = info.xpath('div[1]/a[2]/h2/text()')[0]  # poster's username
            content1 = info.xpath('a[1]/div/span[1]')[0]
            content = content1.xpath('string(.)').strip()  # string(.) flattens nested tags such as <br> into plain text
            laugh_num = info.xpath('div[2]/span[1]/i/text()')[0]  # xpath() returns a list; take the first match
            thumbs_up = info.xpath('a[2]/div/div/div/text()')[0]
            info_list = [user_id, content, laugh_num, thumbs_up]
            all_info_list.append(info_list)
        except IndexError:
            pass  # skip entries (e.g. ads) missing one of these nodes
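Before scraping all thirteen pages, it can help to dry-run the parser on a single page. The snippet below is a minimal sanity check, not part of the assignment code; the URL follows the pattern used in the main block, and printing three rows is an arbitrary choice:

get_info('https://www.qiushibaike.com/text/page/1/')
for row in all_info_list[:3]:
    print(row)  # expect [user, content, laugh_num, thumbs_up]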

if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    # Header row
    column = ['id', 'content', 'laugh_num', 'thumbs_up']
    for t in range(4):
        sheet.write(0, t, column[t])
    # Listing pages 1-13 of the text-jokes section
    urls = [f'https://www.qiushibaike.com/text/page/{i}/' for i in range(1, 14)]
    for url in urls:
        get_info(url)
    # Data rows start at row 1, below the header
    for i, row in enumerate(all_info_list, start=1):
        for j, data in enumerate(row):
            sheet.write(i, j, data)
    book.save('C:/Users/Harron/Desktop/test1.xls')
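A note on the output format: xlwt can only produce the legacy .xls format, which caps a sheet at 65,536 rows. That is plenty here, but if you want to drop the dependency entirely, a minimal sketch using the standard-library csv module (the output file name is illustrative) looks like this:

import csv

with open('test1.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)  # utf-8-sig adds a BOM so Excel detects the encoding
    writer.writerow(['id', 'content', 'laugh_num', 'thumbs_up'])
    writer.writerows(all_info_list)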
