Python爬虫——淘宝商品信息定向爬虫

步骤

获取网页文本
将网页文本中所需内容存入列表
打印列表
注意
需要在请求头中携带登录后的 Cookie 信息
对每一页进行遍历
正则表达式的书写

import json
import re

import requests


def getHTMLText(url, hd):
    """Fetch a page and return its body decoded as text.

    Args:
        url: Address to request.
        hd: Mapping of HTTP headers sent with the request.

    Returns:
        The response text, or the sentinel string "Error" when the request
        fails (connection problem, timeout, or an HTTP error status).
    """
    try:
        r = requests.get(url, headers=hd, timeout=30)
        r.raise_for_status()
        # Decode using the encoding guessed from the body content.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Only request failures map to the sentinel; KeyboardInterrupt,
        # SystemExit and programming errors are no longer swallowed.
        return "Error"


def parsePage(ilt, html):
    """Extract price/title pairs from a search result page into ``ilt``.

    The page text is scanned for ``"view_price":"..."`` and
    ``"raw_title":"..."`` fields; the n-th price is paired with the n-th
    title and appended to ``ilt`` as ``[price, title]`` (both strings).

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated
            in place).
        html: Page text to scan. Anything that is not a string is ignored.

    Returns:
        None.
    """
    if not isinstance(html, str):
        return

    # Capture groups pull out the value directly, so there is no need to
    # split on ':' (which truncated titles containing a colon) or to call
    # eval() on scraped, untrusted text.
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    # The title pattern accepts backslash escapes so an escaped quote (\")
    # inside a title does not end the match early.
    titles = re.findall(r'"raw_title":"((?:[^"\\\n]|\\.)*)"', html)

    # zip() stops at the shorter list, so a count mismatch cannot raise.
    for price, raw_title in zip(prices, titles):
        try:
            # Decode JSON string escapes such as \" or \uXXXX.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body: keep the text as captured
            # instead of dropping the item.
            title = raw_title
        ilt.append([price, title])


def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table.

    A header row is printed first, followed by one row per entry with a
    1-based sequence number, the entry's first element (price) and its
    second element (title).

    Args:
        ilt: Sequence of ``[price, title]`` entries.
    """
    row_format = "{0:4}\t{1:8}\t{2:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for index, item in enumerate(ilt, start=1):
        print(row_format.format(index, item[0], item[1]))


def main():
    """Crawl Taobao search result pages and print the goods found.

    Builds the search URL for the configured keyword, fetches ``depth``
    result pages, collects ``[price, title]`` entries from each page and
    prints them as a table.
    """
    # Local import keeps this block self-contained.
    from urllib.parse import quote

    goods = ''  # search keyword
    depth = 2  # number of result pages to crawl
    # Percent-encode the keyword so spaces, '&', '#' or non-ASCII text
    # cannot corrupt the query string.
    start_url = 'https://s.taobao.com/search?q=' + quote(goods)
    hd = {
        'User-Agent': 'Chrome/10.0',
        'Cookie': ''  # fill in the Cookie of a logged-in session
    }
    infoList = []
    for i in range(depth):
        # The 's' parameter is the item offset; 44 items per page
        # according to the original author's note.
        url = start_url + '&s=' + str(i * 44)
        try:
            html = getHTMLText(url, hd)
            parsePage(infoList, html)
        except Exception:
            # Best effort: a failing page is skipped, but Ctrl-C and
            # SystemExit are no longer swallowed.
            continue
    printGoodsList(infoList)


# Run the crawler only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

Python爬虫——淘宝商品信息定向爬虫

推荐阅读更多精彩内容