【2021-07-16】群买买爬虫

pdd_spider

# -*- coding: utf-8 -*-
import requests
import logging
from logger import logger
import time
import random
import json
import os


def write_json(filename, json_store):
    """Serialize ``json_store`` as JSON into ``filename``.

    The file is opened in ``'w+'`` mode, so existing content is replaced.
    Returns ``None``.
    """
    with open(filename, 'w+') as out_file:
        json.dump(json_store, out_file)


def read_json(filename):
    """Parse the JSON document stored in ``filename`` and return the result."""
    with open(filename) as source:
        return json.load(source)


def post_s(url, data, headers=None, accesstoken=None):
    """POST ``data`` as a JSON body to ``url``, retrying up to 10 times.

    Args:
        url: Endpoint to call.
        data: JSON-serializable request payload.
        headers: Optional request headers. When omitted, a built-in set of
            mini-program style headers is used. A supplied mapping is copied,
            never modified.
        accesstoken: Optional value that replaces the ``accesstoken`` header.

    Returns:
        dict: The decoded response body when its ``success`` field is ``True``;
        an empty dict once every attempt has failed.
    """
    if headers is None:
        headers = {
            'cache-control': 'no-cache',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
            'verifyauthtoken': '',
            'accesstoken': 'KE56LAPGPJPR7CTVN73JOKNCEEQ6DVMFMBWFB4NFIDOSJELM76DQ113204e',
            'code-version': '1.2.41',
            'content-type': 'application/json;charset=UTF-8',
            'p-appname': 'new_cps_shop',
            'rfp': 'Qq6LZv8VJQfKT1420V0DAwbx6Mv58Gl3',
            'referer': 'https://servicewechat.com/wxa7d57206dc1eaf51/194/page-frame.html',
            'accept-encoding': 'gzip, deflate, br',
            # NOTE(review): fixed length looks copied from a captured request;
            # confirm the HTTP client recomputes it for each body.
            'content-length': '551'
        }
    else:
        # Copy so the token override below never leaks into the caller's dict.
        headers = dict(headers)
    if accesstoken is not None:
        headers["accesstoken"] = accesstoken

    for attempt in range(1, 11):
        try:
            # A timeout keeps a stalled connection from hanging the crawl forever.
            res = requests.post(url=url, headers=headers, json=data, timeout=30)
        except requests.exceptions.RequestException as exc:
            # Connection errors and timeouts count as a failed attempt
            # instead of aborting the whole retry loop.
            logger.warning("请求异常,开始第{}次重试".format(attempt))
            logger.warning(exc)
            logger.warning(url)
            logger.warning(headers)
            logger.warning(data)
            time.sleep(random_second(10))
            continue

        try:
            res_json = res.json()
        except ValueError:
            # Body was not valid JSON.
            logger.warning("response 解析失败,开始第{}次重试".format(attempt))
            logger.warning(res)
            logger.warning(res.text)
            logger.warning(url)
            logger.warning(headers)
            logger.warning(data)
            time.sleep(random_second(10))
            continue

        is_mapping = isinstance(res_json, dict)
        if is_mapping and res_json.get("success") is True:
            logger.info("请求成功")
            logger.info(res_json)
            return res_json

        logger.warning("请求失败,原因见下方res_json,开始第{}次重试".format(attempt))
        logger.warning(res_json)
        logger.warning(url)
        logger.warning(headers)
        logger.warning(data)
        # Back off longer when the server reported an explicit error code.
        if is_mapping and res_json.get("error_code") is not None:
            time.sleep(random_second(10))
        else:
            time.sleep(1)

    logger.warning("请求失败,程序停止,返回空结果")
    return {}


def random_second(expect):
    """Return a jittered delay around ``expect``.

    The value is drawn uniformly from ``expect - 0.2`` to ``expect + 0.2``
    and rounded to two decimal places.
    """
    lower = expect - 0.2
    upper = expect + 0.2
    jittered = random.uniform(lower, upper)
    return round(jittered, 2)

def get_subject_list():
    """Collect the subjects of every category, following pagination.

    Returns:
        list[dict]: One dict per subject with the keys ``subject_id``,
        ``subject_title``, ``goods_count``, ``subject_cat`` and
        ``detailed_info`` (the raw item returned by the API).
    """
    def get_subject_list_uniq_page(cat_id, page):
        """Fetch one page and return ``(subjects, is_last_page)``."""
        url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/brand/list?xcx_version=1.2.41"
        data = {
            "open_app_source": 1007,
            "wxapp_uuid": "5dwvfq9d803ynl8ru7fdbdrpzvtegyla",
            "anti_content": "3eE4eFwV3FO-kcBAEaa99UTLK9o",
            "biz_sn": "10_896510356",
            "xcx_version": "1.2.41",
            "cat_id": cat_id,
            "subject_status": 1,
            "page": page
        }
        res_json = post_s(url=url, data=data)
        # post_s hands back {} once its retries are exhausted; treat a missing
        # result as "nothing more to read" instead of dereferencing None.
        result = (res_json or {}).get('result') or {}
        if not result:
            return [], True
        subjects = [
            {"subject_id": i.get("subject_id"), "subject_title": i.get('title'),
             "goods_count": i.get("goods_count"), "subject_cat": cat_id, "detailed_info": i}
            for i in result.get('subject_vos') or []
        ]
        # The response reports page == -1 on the final page.
        return subjects, result.get('page') == -1

    subject_list = []
    # Category 11 used to be listed twice, which fetched its subjects twice.
    for cat_id in [111, 222, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 333]:
        for page in range(1, 100):
            time.sleep(random_second(2))
            subjects, is_last_page = get_subject_list_uniq_page(cat_id=cat_id, page=page)
            subject_list.extend(subjects)
            if is_last_page:
                break
    return subject_list


def get_goods_list(subject_id):
    """Collect every goods entry of one subject, following pagination.

    Args:
        subject_id: Subject whose goods are listed; sent to the API as a string.

    Returns:
        list[dict]: One dict per goods item with the keys ``subject_id``,
        ``goods_id``, ``goods_name`` and ``detailed_info`` (the raw API item).
    """
    def get_goods_list_uniq_page(subject_id, page):
        """Fetch one page and return ``(goods, is_last_page)``."""
        url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/goods/list?xcx_version=1.2.41"
        data = {
            "open_app_source": 1089,
            "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
            "anti_content": "3akAfx5e-eCE65sa_4CM2-T-tPTdf_hSBsSRtVVE-eAmSklTMZ-zLk-0d5tTHWTpFSUck2EOYC5hy9oXqNFOTf4qH9sUbS_NjG4gJnqg4afcBwxkvfzkMPf5sEXj5CDYlwHPBOSe-sCEzeKeBFKE-3SEtxIkBeUk-sVeBwVeBeCeBnSEB6Kxj5vKJpHAp0MDmXPKFTGQTXiNaqlpQiC2dn5QmQZYQYZ4Kg8PYj0Sg_6CSswZvieE-KdAf2fU6nbUScWmdL1AELMDrVxe--bz7bK92xH_Cdo",
            "biz_sn": "10_896510356",
            "xcx_version": "1.2.41",
            "subject_id": str(subject_id),
            "size": 10,
            "page": page,
            "sort_type": "default",
            "search_type": 1,
            "type": 1
        }
        res_json = post_s(url=url, data=data)
        # post_s hands back {} once its retries are exhausted; stop paging
        # instead of dereferencing a missing result.
        result = (res_json or {}).get('result') or {}
        if not result:
            return [], True
        goods = [
            {"subject_id": subject_id, "goods_id": i.get("goods_id"), "goods_name": i.get('goods_name'),
             "detailed_info": i}
            for i in result.get('goods_list') or []
        ]
        # Only page == -1 marks the final page. Previously every page was
        # flagged as final, so just the first page was ever collected.
        return goods, result.get('page') == -1

    goods_list = []
    for page in range(1, 100):
        time.sleep(random_second(2))
        goods, is_last_page = get_goods_list_uniq_page(subject_id=subject_id, page=page)
        goods_list.extend(goods)
        if is_last_page:
            break
    return goods_list


def get_sku_list(subject_id, goods_id):
    """Fetch the SKU list of one goods item within a subject.

    Args:
        subject_id: Subject the goods item belongs to; sent as a string.
        goods_id: Goods item to look up; sent as a string.

    Returns:
        list[dict] | int: One dict per SKU with the keys ``subject_id``,
        ``goods_id``, ``sku_id`` and ``detailed_info`` (the raw API item),
        or ``-1`` when the request produced no result.
    """
    url = "https://api.pinduoduo.com/api/brand-philips/goods/detail/extra/info?xcx_version=1.2.41"
    data = {
        "open_app_source": 1089,
        "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
        "anti_content": "3akAfx5e-eCE650a_4CM2-T-tPTdf_oSBsSRtVVE-eAmSklTMZ-zLk-0d5tTOWTpFS-hk2EcYC5oy9I1_CxGqdNX_m6aet9_xk7f-zFfKW9knB4xlpOxQpdRBpiiQYNFBPP1kZMmpkEquF9YOpEaOpXycG_YnpXxnGXJsG7yXYPjXG4qX54qXY9qXDTaXG_5rB1pRMEN84zzkEwptUzACeL3VUm-Wr24SDBASDs-WMss_fcwM1BTPUl2Tl5gHRqxn_SN44pPlWaf2mIb3xdNxxQJJ05YnfaOIx_9tMe_FSI",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.41",
        "goods_id": str(goods_id),
        "use_ticket_of_wxapp": "",
        "from_env": 27,
        "subject_id": str(subject_id)
    }
    time.sleep(random_second(2))
    res_json = post_s(url=url, data=data)
    # post_s returns {} (not None) when all retries fail, so test for a
    # usable result rather than for None.
    result = (res_json or {}).get("result") or {}
    if not result:
        return -1
    return [
        {"subject_id": subject_id, "goods_id": i.get("goods_id"), "sku_id": i.get('sku_id'), "detailed_info": i}
        for i in result.get("sku_list") or []
    ]
# -*- coding: utf-8 -*-
import csv
import time

import redis

from logger import logger
from pdd_spider import post_s, random_second

# Shared redis handle used by every function below; decode_responses=True
# asks the client to return str values instead of bytes.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)
# Key suffix for this run in "_MM_DD" form, taken from the local clock
# (strftime falls back to the current local time when no time tuple is given).
date = time.strftime("_%m_%d")


def store_list_to_redis(name, list: list):
    """Add every element of ``list`` to the redis set stored under ``name``."""
    # One SADD per element, issued in iteration order.
    for member in list:
        r.sadd(name, member)


def store_subject(subject_list_uniq_page):
    """Write each subject dict into its own redis hash.

    The hash key is ``str(subject_id)`` followed by the module-level ``date``
    suffix. A missing ``goods_count`` is stored as the string ``"unknown"``;
    the replacement is made on the passed-in dict itself.

    Args:
        subject_list_uniq_page: Iterable of subject dicts (without the
            trailing page marker).
    """
    for subject in subject_list_uniq_page:
        # Identity test: only a real None means "count not provided".
        if subject.get("goods_count") is None:
            subject["goods_count"] = "unknown"
        name = str(subject.get("subject_id")) + date
        logger.info(subject)
        # NOTE(review): newer redis-py releases deprecate hmset in favour of
        # hset(name, mapping=...); kept as-is to match the installed client.
        r.hmset(name, subject)


def get_subject_list_uniq_page(cat_id, page, subject_status):
    """Fetch one page of subjects for a category and status.

    Args:
        cat_id: Category id sent to the API.
        page: Page number to request.
        subject_status: Status filter sent to the API.

    Returns:
        list: Subject dicts (keys ``subject_id``, ``subject_title``,
        ``goods_count``, ``subject_cat``, ``subject_status``) followed by one
        trailing marker: ``-1`` when the response reports the last page,
        ``1`` otherwise. An empty list when the request produced no result.
    """
    url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/brand/list?xcx_version=1.2.41"
    data = {
        "open_app_source": 1007,
        "wxapp_uuid": "5dwvfq9d803ynl8ru7fdbdrpzvtegyla",
        "anti_content": "3akAfa5e-eCEG5Bo_09-ece7fwWMfizPZ8_7s4Hz3xw7L4W-eA_zCgVHTIto9zpZVGYdHPpmJPTGgdn5DQ2pXbBmUe-wUkzaZDBsUEBVckBFKkLV-eBeCk-3Se-FHEzF5kB2CD-WSavXqI5ePn_A4PEdbJj5QdXXh54CvDUFamIB2SSvfZeFfu0c27NDdeLV-eLVVytCEv-QFsFs-sACzMAPEE-wCeV2ZLl3dUveI3FKVDclmE9TDK4ATq",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.42",
        "cat_id": cat_id,
        "subject_status": subject_status,
        "page": page
    }
    res_json = post_s(url=url, data=data)
    # post_s returns {} after exhausting its retries. ``is {}`` can never be
    # true (it compares identity with a fresh dict), so test emptiness instead.
    result = (res_json or {}).get('result') or {}
    if not result:
        return []
    subject_list = [
        {"subject_id": i.get("subject_id"), "subject_title": i.get('title'),
         "goods_count": i.get("goods_count"), "subject_cat": cat_id, "subject_status": subject_status}
        for i in result.get('subject_vos') or []
    ]
    # Trailing marker tells the caller whether to keep paging.
    subject_list.append(-1 if result.get('page') == -1 else 1)
    return subject_list


def get_subject_info():
    """Crawl the subjects of every unfinished category and store them in redis.

    Categories are tracked in two redis sets: ``cat_id_list`` (all categories)
    and ``cat_id_list_finish`` (completed ones). Only their difference is
    crawled, so an interrupted run can be resumed. For each category both
    subject statuses (0 and 1) are paged through and every page is handed to
    ``store_subject``.
    """
    cat_id_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11]
    # Category ids: 111 featured, 222 new arrivals, 1 women's clothing,
    # 2 men's clothing, 3 mother & baby, 4 food, 5 underwear, 6 home & textiles,
    # 7 shoes/bags/accessories, 8 beauty & personal care, 9 sports,
    # 10 daily necessities, 12 digital/3C, 333 first release, 11 more
    store_list_to_redis("cat_id_list", cat_id_list)
    cat_id_list_remain = r.sdiff("cat_id_list", "cat_id_list_finish")
    logger.info("剩余类目任务列表如下:")
    logger.info(cat_id_list_remain)
    for cat_id in cat_id_list_remain:
        category_ok = True
        for subject_status in [0, 1]:
            for page in range(1, 100):
                logger.info("cat_id={}, page={}, subject_status={}".format(str(cat_id), str(page), str(subject_status)))
                subject_list_uniq_page = get_subject_list_uniq_page(cat_id=cat_id, page=page,
                                                                    subject_status=subject_status)
                time.sleep(random_second(10))
                # An empty page means the request failed. ``is []`` never
                # matched, so the next line used to raise IndexError.
                if not subject_list_uniq_page:
                    category_ok = False
                    break
                store_subject(subject_list_uniq_page[:-1])
                if subject_list_uniq_page[-1] == -1:
                    break
        if category_ok:
            r.sadd("cat_id_list_finish", cat_id)
        else:
            # Leave the category unfinished so a later run retries it.
            logger.warning("cat_id={} 请求失败,未标记为完成".format(str(cat_id)))


def get_goods_list_uniq_page(subject_id, page):
    """Fetch one page of goods ids for a subject.

    Args:
        subject_id: Subject whose goods are listed; sent as a string.
        page: Page number to request.

    Returns:
        list: The ``goods_id`` of each item on the page followed by one
        trailing marker: ``-1`` when the response reports the last page,
        ``1`` otherwise. An empty list when the request produced no result.
    """
    url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/goods/list?xcx_version=1.2.41"
    data = {
        "open_app_source": 1089,
        "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
        "anti_content": "3akAfx5e-eCE65sa_4CM2-T-tPTdf_hSBsSRtVVE-eAmSklTMZ-zLk-0d5tTHWTpFSUck2EOYC5hy9oXqNFOTf4qH9sUbS_NjG4gJnqg4afcBwxkvfzkMPf5sEXj5CDYlwHPBOSe-sCEzeKeBFKE-3SEtxIkBeUk-sVeBwVeBeCeBnSEB6Kxj5vKJpHAp0MDmXPKFTGQTXiNaqlpQiC2dn5QmQZYQYZ4Kg8PYj0Sg_6CSswZvieE-KdAf2fU6nbUScWmdL1AELMDrVxe--bz7bK92xH_Cdo",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.41",
        "subject_id": str(subject_id),
        "size": 10,
        "page": page,
        "sort_type": "default",
        "search_type": 1,
        "type": 1
    }
    res_json = post_s(url=url, data=data)
    # post_s returns {} after exhausting its retries. ``is {}`` can never be
    # true (it compares identity with a fresh dict), so test emptiness instead.
    result = (res_json or {}).get('result') or {}
    if not result:
        return []
    goods_list_uniq_page = [i.get("goods_id") for i in result.get('goods_list') or []]
    # Trailing marker tells the caller whether to keep paging.
    goods_list_uniq_page.append(-1 if result.get('page') == -1 else 1)
    return goods_list_uniq_page


def get_goods_list_uniq_subject(subject_id):
    """Collect the goods ids of one subject across all of its pages.

    Paging stops at the page marked as last (trailing ``-1``), at the first
    empty page, or after page 99.

    Args:
        subject_id: Subject whose goods ids are collected.

    Returns:
        list: Goods ids in the order they were returned (may contain repeats).
    """
    goods_list = []
    for page in range(1, 100):
        goods_list_uniq_page = get_goods_list_uniq_page(subject_id=subject_id, page=page)
        time.sleep(random_second(10))
        # ``is []`` never matched, so an empty page used to fall through to
        # ``[-1]`` and raise IndexError.
        if not goods_list_uniq_page:
            break
        goods_list.extend(goods_list_uniq_page[:-1])
        if goods_list_uniq_page[-1] == -1:
            break
    return goods_list


def refresh_gc_uniq_subject(date: str, subject_id):
    """Recount one subject's distinct goods and save the count in redis.

    The count is written to the ``goods_count`` field of the hash whose key
    is ``str(subject_id) + date``.
    """
    distinct_goods = set(get_goods_list_uniq_subject(subject_id))
    hash_key = str(subject_id) + date
    r.hset(hash_key, "goods_count", len(distinct_goods))


def refresh_gc():
    """Fill in ``goods_count`` for today's subjects that were stored as "unknown".

    Every redis key containing the module-level ``date`` suffix is read as a
    subject hash; when its ``goods_count`` field equals ``"unknown"`` the
    count is recomputed through ``refresh_gc_uniq_subject``.
    """
    name_list = {name for name in r.keys() if date in name}
    remaining = len(name_list)
    for name in name_list:
        logger.info("剩余任务:{}".format(remaining))
        subject = r.hgetall(name)
        # Read fields by name: redis does not promise that HGETALL returns
        # fields in insertion order, so positional access could pick up the
        # wrong value as the subject id.
        if subject.get("goods_count") == "unknown":
            refresh_gc_uniq_subject(date, subject.get("subject_id"))
        remaining -= 1


def export_data(date: str, path):
    """Export the subjects stored for ``date`` to a CSV file.

    Args:
        date: Key suffix selecting the subjects, e.g. ``_07_10``.
        path: Directory that receives the file; the file name is the suffix
            without its leading underscore plus ``.csv``.

    The columns are subject_id, subject_title, sku (filled from the stored
    ``goods_count`` field), subject_cat (translated to its category name) and
    subject_status.
    """
    # date example: _07_10
    cat_dict = {"111": "主推", "222": "上新", "1": "女装", "2": "男装", "3": "母婴", "4": "食品", "5": "内衣", "6": "家居家纺",
                "7": "鞋包配饰", "8": "美妆个护",
                "9": "运动", "10": "日用百货", "12": "数码3c", "333": "首发", "11": "更多"}
    path = path + "\\" + date[1:] + ".csv"
    headers = ["subject_id", "subject_title", "sku", "subject_cat", "subject_status"]
    rows = []
    for name in r.keys():
        if date not in name:
            continue
        subject = r.hgetall(name)
        cat = subject.get("subject_cat")
        # Build each row by field name so the columns line up with the header
        # regardless of the order in which redis returns hash fields.
        rows.append([
            subject.get("subject_id"),
            subject.get("subject_title"),
            subject.get("goods_count"),
            # Unknown category ids are written through unchanged instead of
            # aborting the export with a KeyError.
            cat_dict.get(cat, cat),
            subject.get("subject_status"),
        ])
    with open(path, 'w', newline="") as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)


# Script entry: the three steps below run in order whenever this module is executed.
# 1) Crawl subject lists for the categories not yet marked finished and store them in redis.
get_subject_info()
# 2) Recompute goods_count for subjects that were stored as "unknown".
refresh_gc()
# 3) Write today's subjects to <directory>\<MM_DD>.csv.
export_data(date, r"C:\file\实习\东方证券 6.30\拼多多\数据")
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 212,542评论 6 493
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 90,596评论 3 385
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 158,021评论 0 348
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 56,682评论 1 284
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 65,792评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 49,985评论 1 291
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,107评论 3 410
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 37,845评论 0 268
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,299评论 1 303
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 36,612评论 2 327
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 38,747评论 1 341
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,441评论 4 333
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,072评论 3 317
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 30,828评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,069评论 1 267
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 46,545评论 2 362
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 43,658评论 2 350

推荐阅读更多精彩内容

  • 自由就是,你要承担得起过山车般起伏的人 我突然感觉哦,减肥减掉的不是体重,而是过往的一些错乱的认知和不恰当的习惯[...
    艾雪的简书阅读 1,570评论 0 0
  • 再忙也要把当下的工作检查好,不着急的可放一放但是回过头也一定要检查仔细,不要有遗漏的地方。
    天长地久盼三生阅读 125评论 0 0
  • 今天早晨和朋友聊天提起各自的爱豆。 自然而然提起黎明近期的演唱会。 提起他邀请林海峰和林日曦被很多路人骂,我刚说完...
    lixiyang阅读 324评论 0 0
  • 日更第66天 今天早上在思考🤔,我这段时间陪豆豆的时间其实真的不是太多,豆豆自从疱疹性咽夹炎提前15天放假之后,在...
    瓦顶那只猫阅读 120评论 0 1
  • 通常防盗链是为了防止图片,视频被盗用。下面简单介绍几种防盗链的机制: 利用HTTP Referer字段 HTTP请...
    平凡的雪夜冬天阅读 751评论 0 0