Scraping Qiushibaike

This short crawler pulls text posts from Qiushibaike's /text/ channel with requests and lxml, then writes each post's author, body, laugh count, and comment count to an Excel sheet with xlwt.

import requests
from lxml import etree
import xlwt

# Browser User-Agent; Qiushibaike tends to reject requests that identify as a script.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36'
}
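
To confirm the header actually goes out with the request, a minimal sanity check against httpbin.org (an assumption, used purely for illustration; it is not part of the original script) could look like:

# httpbin.org echoes back the request headers it received.
r = requests.get('https://httpbin.org/headers', headers=headers)
print(r.json()['headers']['User-Agent'])  # should print the Chrome UA above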

all_info_lists = []

def get_info(url, headers=headers):
    res = requests.get(url, headers=headers)  # send the browser UA defined above
    html = etree.HTML(res.text)
    infos = html.xpath('//div[@class="col1"]/div')  # one div per post on the page
    for info in infos:
        try:
            id = info.xpath('div[1]/a[2]/h2/text()')[0]           # author name
            content1 = info.xpath('a[1]/div/span[1]')[0]          # element holding the post body
            content = content1.xpath('string(.)').strip()         # flatten nested text into one string
            laugh = info.xpath('div[2]/span[1]/i/text()')[0]      # laugh count
            comment = info.xpath('div[2]/span[2]/a/i/text()')[0]  # comment count
            info_list = [id, content, laugh, comment]
            all_info_lists.append(info_list)
        except IndexError:
            pass  # skip divs (ads, widgets) that don't match the post layout
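
The string(.) step is what joins a post body that is split across nested tags (such as <br>) into a single string; text() alone would return the separate fragments. A self-contained illustration with made-up HTML, not the live page:

from lxml import etree

span = etree.HTML('<span>line one<br/>line two</span>').xpath('//span')[0]
print(span.xpath('text()'))      # ['line one', 'line two'] -- separate fragments
print(span.xpath('string(.)'))   # 'line oneline two' -- all text flattened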

if __name__ == '__main__':
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('Sheet1')
    header = ['id', 'content', 'laugh', 'comment']
    for t in range(len(header)):
        sheet.write(0, t, header[t])  # header row
    urls = ['https://www.qiushibaike.com/text/page/{}/'.format(str(i)) for i in range(1, 14)]  # pages 1-13
    for url in urls:
        get_info(url)
    i = 1  # row 0 holds the header, so data starts at row 1
    for info_list in all_info_lists:
        j = 0
        for data in info_list:
            sheet.write(i, j, data)
            j += 1
        i += 1
    book.save('C:/Users/admin/Desktop/test.xls')
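
Note that xlwt can only produce the legacy .xls format. If a modern .xlsx file is preferred, a minimal sketch using openpyxl instead (an assumption; the original sticks with xlwt) would be:

from openpyxl import Workbook

def save_xlsx(rows, path):
    # Write the header row followed by the scraped rows to an .xlsx workbook.
    wb = Workbook()
    ws = wb.active
    ws.append(['id', 'content', 'laugh', 'comment'])
    for row in rows:
        ws.append(row)
    wb.save(path)

# Usage: save_xlsx(all_info_lists, 'C:/Users/admin/Desktop/test.xlsx')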
