# pdd_spider
# -*- coding: utf-8 -*-
import requests
import logging
from logger import logger
import time
import random
import json
import os
def write_json(filename, json_store):
    """Serialize ``json_store`` as JSON into the file at ``filename``.

    The file is opened in ``'w+'`` mode, so existing content is replaced and
    a missing file is created.

    :param filename: path of the file to write.
    :param json_store: any object ``json.dump`` can serialize.
    """
    with open(filename, 'w+') as handle:
        json.dump(json_store, fp=handle)
def read_json(filename):
    """Load and return the JSON document stored in the file at ``filename``.

    :param filename: path of the JSON file to read.
    :return: the decoded Python object.
    """
    with open(filename) as source:
        return json.load(source)
def post_s(url, data, headers=None, accesstoken=None, max_retries=10, timeout=30):
    """POST ``data`` as JSON to ``url`` and return the decoded response.

    The request is repeated until the decoded body reports
    ``"success": true`` or ``max_retries`` attempts have been used. Network
    errors and undecodable bodies count as failed attempts instead of
    aborting the caller.

    :param url: endpoint to POST to.
    :param data: JSON-serializable request body.
    :param headers: optional request headers; a built-in default set is used
        when omitted. The caller's dict is never modified.
    :param accesstoken: optional value overriding the ``accesstoken`` header.
    :param max_retries: maximum number of attempts (default 10).
    :param timeout: per-request timeout in seconds handed to ``requests``.
    :return: the decoded JSON dict on success, or ``{}`` once every attempt
        has failed.
    """
    if headers is None:
        # NOTE(review): the access token below is a credential committed to
        # source; consider loading it from configuration instead.
        # NOTE(review): 'content-length' looks copied from a captured request;
        # confirm it is not sent verbatim for bodies of a different size.
        headers = {
            'cache-control': 'no-cache',
            'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36 MicroMessenger/7.0.9.501 NetType/WIFI MiniProgramEnv/Windows WindowsWechat',
            'verifyauthtoken': '',
            'accesstoken': 'KE56LAPGPJPR7CTVN73JOKNCEEQ6DVMFMBWFB4NFIDOSJELM76DQ113204e',
            'code-version': '1.2.41',
            'content-type': 'application/json;charset=UTF-8',
            'p-appname': 'new_cps_shop',
            'rfp': 'Qq6LZv8VJQfKT1420V0DAwbx6Mv58Gl3',
            'referer': 'https://servicewechat.com/wxa7d57206dc1eaf51/194/page-frame.html',
            'accept-encoding': 'gzip, deflate, br',
            'content-length': '551'
        }
    else:
        # Work on a copy so overriding the token never leaks into the
        # dict owned by the caller.
        headers = dict(headers)
    if accesstoken is not None:
        headers["accesstoken"] = accesstoken

    for attempt in range(1, max_retries + 1):
        try:
            res = requests.post(url=url, headers=headers, json=data, timeout=timeout)
        except requests.RequestException as exc:
            # Timeouts, connection resets and similar transient failures are
            # retried rather than propagated.
            logger.warning("请求异常,开始第{}次重试".format(attempt))
            logger.warning(exc)
            logger.warning(url)
            logger.warning(headers)
            logger.warning(data)
            time.sleep(random_second(10))
            continue

        try:
            res_json = res.json()
        except ValueError:
            # A body that is not valid JSON surfaces as a ValueError subclass.
            logger.warning("response 解析失败,开始第{}次重试".format(attempt))
            logger.warning(res)
            logger.warning(res.text)
            logger.warning(url)
            logger.warning(headers)
            logger.warning(data)
            time.sleep(random_second(10))
            continue

        is_mapping = isinstance(res_json, dict)
        if is_mapping and res_json.get("success") is True:
            logger.info("请求成功")
            logger.info(res_json)
            return res_json

        logger.warning("请求失败,原因见下方res_json,开始第{}次重试".format(attempt))
        logger.warning(res_json)
        logger.warning(url)
        logger.warning(headers)
        logger.warning(data)
        # Back off longer when the server answered with an explicit error code.
        if is_mapping and res_json.get("error_code") is not None:
            time.sleep(random_second(10))
        else:
            time.sleep(1)

    logger.warning("请求失败,程序停止,返回空结果")
    return {}
def random_second(expect):
    """Return a randomized delay close to ``expect`` seconds.

    A value is drawn uniformly between ``expect - 0.2`` and ``expect + 0.2``
    and rounded to two decimal places.

    :param expect: centre of the interval, in seconds.
    :return: the jittered delay as a float.
    """
    lower = expect - 0.2
    upper = expect + 0.2
    jittered = random.uniform(lower, upper)
    return round(jittered, 2)
def get_subject_list():
    """Collect the subjects of every category by paging through the list API.

    For each category, pages 1..99 are requested until the API reports the
    last page (``page == -1``) or a request fails.

    :return: list of dicts with the keys ``subject_id``, ``subject_title``,
        ``goods_count``, ``subject_cat`` and ``detailed_info``.
    """

    def get_subject_list_uniq_page(cat_id, page):
        """Fetch one page; return ``(subjects, is_last_page)``."""
        url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/brand/list?xcx_version=1.2.41"
        data = {
            "open_app_source": 1007,
            "wxapp_uuid": "5dwvfq9d803ynl8ru7fdbdrpzvtegyla",
            "anti_content": "3eE4eFwV3FO-kcBAEaa99UTLK9o",
            "biz_sn": "10_896510356",
            "xcx_version": "1.2.41",
            "cat_id": cat_id,
            "subject_status": 1,
            "page": page
        }
        res_json = post_s(url=url, data=data)
        result = res_json.get('result') if res_json else None
        if not result:
            # An empty response or a missing 'result' means the request
            # failed: stop paging this category instead of crashing.
            return [], True
        subjects = [{"subject_id": i.get("subject_id"), "subject_title": i.get('title'),
                     "goods_count": i.get("goods_count"), "subject_cat": cat_id, "detailed_info": i}
                    for i in result.get('subject_vos') or []]
        # Returning a tuple avoids the former `[...].append(x)` expression,
        # which evaluates to None.
        return subjects, result.get('page') == -1

    subject_list = []
    # Category 11 used to be listed twice, which fetched its subjects twice.
    for cat_id in [111, 222, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 333]:
        for page in range(1, 100):
            time.sleep(random_second(2))
            subjects, is_last_page = get_subject_list_uniq_page(cat_id=cat_id, page=page)
            subject_list.extend(subjects)
            if is_last_page:
                break
    return subject_list
def get_goods_list(subject_id):
    """Collect all goods of one subject by paging through the goods API.

    Pages 1..99 are requested until the API reports the last page
    (``page == -1``) or a request fails.

    :param subject_id: id of the subject whose goods are listed.
    :return: list of dicts with the keys ``subject_id``, ``goods_id``,
        ``goods_name`` and ``detailed_info``.
    """

    def get_goods_list_uniq_page(subject_id, page):
        """Fetch one page; return ``(goods, is_last_page)``."""
        url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/goods/list?xcx_version=1.2.41"
        data = {
            "open_app_source": 1089,
            "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
            "anti_content": "3akAfx5e-eCE65sa_4CM2-T-tPTdf_hSBsSRtVVE-eAmSklTMZ-zLk-0d5tTHWTpFSUck2EOYC5hy9oXqNFOTf4qH9sUbS_NjG4gJnqg4afcBwxkvfzkMPf5sEXj5CDYlwHPBOSe-sCEzeKeBFKE-3SEtxIkBeUk-sVeBwVeBeCeBnSEB6Kxj5vKJpHAp0MDmXPKFTGQTXiNaqlpQiC2dn5QmQZYQYZ4Kg8PYj0Sg_6CSswZvieE-KdAf2fU6nbUScWmdL1AELMDrVxe--bz7bK92xH_Cdo",
            "biz_sn": "10_896510356",
            "xcx_version": "1.2.41",
            "subject_id": str(subject_id),
            "size": 10,
            "page": page,
            "sort_type": "default",
            "search_type": 1,
            "type": 1
        }
        res_json = post_s(url=url, data=data)
        result = res_json.get('result') if res_json else None
        if not result:
            # An empty response or a missing 'result' means the request
            # failed: stop paging instead of crashing.
            return [], True
        goods = [
            {"subject_id": subject_id, "goods_id": i.get("goods_id"), "goods_name": i.get('goods_name'),
             "detailed_info": i} for i in result.get('goods_list') or []]
        # Only `page == -1` marks the last page; previously both branches
        # signalled "last page", so paging always stopped after page 1.
        return goods, result.get('page') == -1

    goods_list = []
    for page in range(1, 100):
        time.sleep(random_second(2))
        goods, is_last_page = get_goods_list_uniq_page(subject_id=subject_id, page=page)
        goods_list.extend(goods)
        if is_last_page:
            break
    return goods_list
def get_sku_list(subject_id, goods_id):
    """Fetch the SKU list of one goods item.

    :param subject_id: id of the subject the goods item belongs to.
    :param goods_id: id of the goods item.
    :return: ``-1`` when the request failed (empty response or no
        ``result``); otherwise a list of dicts with the keys ``subject_id``,
        ``goods_id``, ``sku_id`` and ``detailed_info``.
    """
    url = "https://api.pinduoduo.com/api/brand-philips/goods/detail/extra/info?xcx_version=1.2.41"
    data = {
        "open_app_source": 1089,
        "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
        "anti_content": "3akAfx5e-eCE650a_4CM2-T-tPTdf_oSBsSRtVVE-eAmSklTMZ-zLk-0d5tTOWTpFS-hk2EcYC5oy9I1_CxGqdNX_m6aet9_xk7f-zFfKW9knB4xlpOxQpdRBpiiQYNFBPP1kZMmpkEquF9YOpEaOpXycG_YnpXxnGXJsG7yXYPjXG4qX54qXY9qXDTaXG_5rB1pRMEN84zzkEwptUzACeL3VUm-Wr24SDBASDs-WMss_fcwM1BTPUl2Tl5gHRqxn_SN44pPlWaf2mIb3xdNxxQJJ05YnfaOIx_9tMe_FSI",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.41",
        "goods_id": str(goods_id),
        "use_ticket_of_wxapp": "",
        "from_env": 27,
        "subject_id": str(subject_id)
    }
    time.sleep(random_second(2))
    res_json = post_s(url=url, data=data)
    # Test for emptiness, not identity with None, so a failed request that
    # comes back as an empty dict takes the failure path too.
    result = res_json.get("result") if res_json else None
    if not result:
        return -1
    return [{"subject_id": subject_id, "goods_id": i.get("goods_id"), "sku_id": i.get('sku_id'), "detailed_info": i}
            for i in result.get("sku_list") or []]
# -*- coding: utf-8 -*-
import csv
import time
import redis
from logger import logger
from pdd_spider import post_s, random_second
# Shared Redis connection: local server, responses decoded to ``str``.
pool = redis.ConnectionPool(
    host='localhost',
    port=6379,
    decode_responses=True,
)
r = redis.Redis(connection_pool=pool)
# "_MM_DD" suffix of the current local date, appended to per-subject keys.
date = time.strftime("_%m_%d")
def store_list_to_redis(name, list: list):
    """Add every element of ``list`` to the Redis set stored at ``name``.

    :param name: key of the Redis set.
    :param list: values to add; an empty sequence is a no-op.
    """
    # NOTE: the parameter name shadows the builtin ``list``; it is kept so
    # existing keyword callers continue to work.
    members = tuple(list)
    if not members:
        # SADD requires at least one member.
        return
    # A single SADD with all members replaces one round trip per element.
    r.sadd(name, *members)
def store_subject(subject_list_uniq_page):
    """Write each subject dict to Redis as a hash.

    The hash key is ``str(subject_id)`` followed by the module-level ``date``
    suffix. A missing or ``None`` ``goods_count`` is replaced with
    ``"unknown"`` in the passed dict before it is stored.

    :param subject_list_uniq_page: iterable of subject dicts.
    """
    for entry in subject_list_uniq_page:
        count = entry.get("goods_count")
        entry["goods_count"] = "unknown" if count is None else count
        key = str(entry.get("subject_id")) + date
        logger.info(entry)
        r.hmset(key, entry)
def get_subject_list_uniq_page(cat_id, page, subject_status):
    """Fetch one page of subjects for a category / status pair.

    :param cat_id: category id sent to the API.
    :param page: page number to request.
    :param subject_status: subject status sent to the API.
    :return: ``[]`` when the request failed; otherwise the subject dicts
        (keys ``subject_id``, ``subject_title``, ``goods_count``,
        ``subject_cat``, ``subject_status``) followed by one trailing marker:
        ``-1`` if this was the last page, ``1`` if more pages follow.
    """
    url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/brand/list?xcx_version=1.2.41"
    data = {
        "open_app_source": 1007,
        "wxapp_uuid": "5dwvfq9d803ynl8ru7fdbdrpzvtegyla",
        "anti_content": "3akAfa5e-eCEG5Bo_09-ece7fwWMfizPZ8_7s4Hz3xw7L4W-eA_zCgVHTIto9zpZVGYdHPpmJPTGgdn5DQ2pXbBmUe-wUkzaZDBsUEBVckBFKkLV-eBeCk-3Se-FHEzF5kB2CD-WSavXqI5ePn_A4PEdbJj5QdXXh54CvDUFamIB2SSvfZeFfu0c27NDdeLV-eLVVytCEv-QFsFs-sACzMAPEE-wCeV2ZLl3dUveI3FKVDclmE9TDK4ATq",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.42",
        "cat_id": cat_id,
        "subject_status": subject_status,
        "page": page
    }
    res_json = post_s(url=url, data=data)
    # `res_json is {}` can never be true (a fresh dict literal is a new
    # object), so emptiness is tested by truthiness instead.
    result = res_json.get('result') if res_json else None
    if not result:
        return []
    subject_list = [{"subject_id": i.get("subject_id"), "subject_title": i.get('title'),
                     "goods_count": i.get("goods_count"), "subject_cat": cat_id, "subject_status": subject_status}
                    for i in result.get('subject_vos') or []]
    subject_list.append(-1 if result.get('page') == -1 else 1)
    return subject_list
def get_subject_info():
    """Crawl the subjects of every unfinished category into Redis.

    The category ids are stored in the Redis set ``cat_id_list``; ids already
    present in ``cat_id_list_finish`` are skipped. For each remaining
    category both subject statuses (0 and 1) are paged through and every
    page is stored via ``store_subject``. A category is added to
    ``cat_id_list_finish`` only when none of its page requests failed, so a
    later run picks it up again.
    """
    cat_id_list = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 12, 11]
    # Category ids: 111 featured, 222 new arrivals, 1 women's clothing,
    # 2 men's clothing, 3 mother & baby, 4 food, 5 underwear, 6 home &
    # home textiles, 7 shoes/bags/accessories, 8 beauty & personal care,
    # 9 sports, 10 daily necessities, 12 digital/3C, 333 first release, 11 more
    store_list_to_redis("cat_id_list", cat_id_list)
    cat_id_list_remain = r.sdiff("cat_id_list", "cat_id_list_finish")
    logger.info("剩余类目任务列表如下:")
    logger.info(cat_id_list_remain)
    for cat_id in cat_id_list_remain:
        fetched_all = True
        for subject_status in [0, 1]:
            for page in range(1, 100):
                logger.info("cat_id={}, page={}, subject_status={}".format(str(cat_id), str(page), str(subject_status)))
                subject_list_uniq_page = get_subject_list_uniq_page(cat_id=cat_id, page=page,
                                                                    subject_status=subject_status)
                time.sleep(random_second(10))
                # `x is []` is always False; test emptiness so a failed page
                # does not reach the `[-1]` lookup below.
                if not subject_list_uniq_page:
                    fetched_all = False
                    break
                store_subject(subject_list_uniq_page[:-1])
                if subject_list_uniq_page[-1] == -1:
                    break
        if fetched_all:
            r.sadd("cat_id_list_finish", cat_id)
        else:
            logger.warning("cat_id={} 抓取不完整,未标记为完成".format(str(cat_id)))
def get_goods_list_uniq_page(subject_id, page):
    """Fetch one page of goods ids for a subject.

    :param subject_id: id of the subject whose goods are listed.
    :param page: page number to request.
    :return: ``[]`` when the request failed; otherwise the goods ids of the
        page followed by one trailing marker: ``-1`` if this was the last
        page, ``1`` if more pages follow.
    """
    url = "https://api.pinduoduo.com/api/brand-philips-salesman/subject/goods/list?xcx_version=1.2.41"
    data = {
        "open_app_source": 1089,
        "wxapp_uuid": "r4ucawh6mp8r0ksxsyhws9e4clqrsjkl",
        "anti_content": "3akAfx5e-eCE65sa_4CM2-T-tPTdf_hSBsSRtVVE-eAmSklTMZ-zLk-0d5tTHWTpFSUck2EOYC5hy9oXqNFOTf4qH9sUbS_NjG4gJnqg4afcBwxkvfzkMPf5sEXj5CDYlwHPBOSe-sCEzeKeBFKE-3SEtxIkBeUk-sVeBwVeBeCeBnSEB6Kxj5vKJpHAp0MDmXPKFTGQTXiNaqlpQiC2dn5QmQZYQYZ4Kg8PYj0Sg_6CSswZvieE-KdAf2fU6nbUScWmdL1AELMDrVxe--bz7bK92xH_Cdo",
        "biz_sn": "10_896510356",
        "xcx_version": "1.2.41",
        "subject_id": str(subject_id),
        "size": 10,
        "page": page,
        "sort_type": "default",
        "search_type": 1,
        "type": 1
    }
    res_json = post_s(url=url, data=data)
    # `res_json is {}` can never be true (a fresh dict literal is a new
    # object), so emptiness is tested by truthiness instead.
    result = res_json.get('result') if res_json else None
    if not result:
        return []
    goods_list_uniq_page = [i.get("goods_id") for i in result.get('goods_list') or []]
    goods_list_uniq_page.append(-1 if result.get('page') == -1 else 1)
    return goods_list_uniq_page
def get_goods_list_uniq_subject(subject_id):
    """Collect the goods ids of one subject across all of its pages.

    Pages 1..99 are requested until a page carries the last-page marker
    (``-1``) or a page comes back empty (failed request).

    :param subject_id: id of the subject whose goods are listed.
    :return: list of goods ids (may contain duplicates).
    """
    goods_list = []
    for page in range(1, 100):
        goods_list_uniq_page = get_goods_list_uniq_page(subject_id=subject_id, page=page)
        time.sleep(random_second(10))
        # `x is []` is always False; test emptiness so an empty page does not
        # reach the `[-1]` lookup below.
        if not goods_list_uniq_page:
            break
        goods_list.extend(goods_list_uniq_page[:-1])
        if goods_list_uniq_page[-1] == -1:
            break
    return goods_list
def refresh_gc_uniq_subject(date: str, subject_id):
    """Recount the distinct goods of a subject and store the count in Redis.

    :param date: key suffix appended to ``str(subject_id)`` to form the hash
        key.
    :param subject_id: id of the subject to recount.
    """
    unique_goods = set(get_goods_list_uniq_subject(subject_id))
    key = str(subject_id) + date
    r.hset(key, "goods_count", len(unique_goods))
def refresh_gc():
    """Recount goods for stored subjects whose ``goods_count`` is ``"unknown"``.

    Every Redis key containing the module-level ``date`` suffix is read as a
    hash; when its ``goods_count`` field equals ``"unknown"`` the count is
    recomputed through ``refresh_gc_uniq_subject``.
    """
    todays_keys = {name for name in r.keys() if date in name}
    total = len(todays_keys)
    for done, name in enumerate(todays_keys):
        logger.info("剩余任务:{}".format(total - done))
        subject = r.hgetall(name)
        # Read fields by name: the order of hash values is not a contract,
        # and "unknown" in some other field (e.g. a title) must not trigger
        # a recount.
        if subject.get("goods_count") != "unknown":
            continue
        subject_id = subject.get("subject_id")
        if subject_id is None:
            continue
        refresh_gc_uniq_subject(date, subject_id)
def export_data(date: str, path, encoding="utf-8-sig"):
    """Export the stored subjects of one day to ``<path>/<MM_DD>.csv``.

    :param date: key suffix such as ``_07_10``; every Redis key containing it
        is exported and the text after the leading underscore names the file.
    :param path: directory the CSV file is written into.
    :param encoding: text encoding of the CSV file. Defaults to UTF-8 with a
        BOM so the output does not depend on the platform's default encoding.
    """
    import os  # local import: only needed for building the output path

    cat_dict = {"111": "主推", "222": "上新", "1": "女装", "2": "男装", "3": "母婴", "4": "食品", "5": "内衣", "6": "家居家纺",
                "7": "鞋包配饰", "8": "美妆个护",
                "9": "运动", "10": "日用百货", "12": "数码3c", "333": "首发", "11": "更多"}
    # os.path.join uses the platform separator instead of a hard-coded "\\".
    path = os.path.join(path, date[1:] + ".csv")
    headers = ["subject_id", "subject_title", "sku", "subject_cat", "subject_status"]
    # Hash fields in CSV column order; the "sku" column carries the stored
    # ``goods_count`` field. Reading by name avoids relying on the order in
    # which Redis returns hash values.
    fields = ["subject_id", "subject_title", "goods_count", "subject_cat", "subject_status"]
    cat_column = fields.index("subject_cat")
    rows = []
    for name in r.keys():
        if date not in name:
            continue
        subject = r.hgetall(name)
        row = [subject.get(field, "") for field in fields]
        # Keep the raw id when the category has no display name.
        row[cat_column] = cat_dict.get(row[cat_column], row[cat_column])
        rows.append(row)
    with open(path, 'w+', newline="", encoding=encoding) as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(rows)
# Script entry point, run in this order: crawl the subjects into Redis, fill
# in goods counts still marked "unknown", then export the day's data to CSV.
# NOTE(review): these calls execute on import as well; consider an
# `if __name__ == "__main__":` guard if this module is ever imported.
get_subject_info()
refresh_gc()
export_data(date, r"C:\file\实习\东方证券 6.30\拼多多\数据")