SHEIN 网站的反爬措施在逐步升级,最近又新增了几个参数校验。现在需要破解的参数有 anti-in、smdeviceid、armortoken 和 x-gw-auth。部分网页可能不需要校验每个参数,但想获取全量数据,不破解这些参数是不行的。
列表页部分代码:
def treat_page(self, region, link, page):
    """Fetch one product-list page and hand every product to ``self.request_detail``.

    Builds the ``/api/productList/info/get`` URL for ``link``, requests it up
    to five times (each attempt uses a fresh random User-Agent plus the
    headers and cookies returned by ``gen_device``), then submits one
    ``self.request_detail(url, region, referer)`` call per returned product to
    a 40-worker thread pool and waits for all of them to finish.

    Args:
        region: Site code; one of ``BR``, ``PH``, ``SG``, ``US``, ``MY``, ``TH``.
        link: Listing page URL (category, ``pdsearch`` or ``RecommendSelection``).
        page: Page number; ``1`` is sent as ``requestType=firstLoad``, anything
            else as ``pageChange``.

    Raises:
        RuntimeError: ``region`` is not one of the supported site codes.
        ValueError: no route id could be extracted from ``link``.
        Exception: all five attempts failed; the argument is the request URL.
    """
    lang_by_region = {
        'BR': 'pt-br',
        'PH': 'en',
        'SG': 'en',
        'US': 'en',
        'MY': 'en',
        'TH': 'th',
    }
    if region not in lang_by_region:
        # The original used a bare ``raise`` here, which surfaces as an opaque
        # "No active exception to reraise" RuntimeError; keep the type, add context.
        raise RuntimeError(f'unsupported region: {region!r}')

    host = f'https://{region.lower()}.shein.com'
    base_url = f'{host}/api/productList/info/get?'

    if 'pdsearch' in link:
        route_match = re.search(r'pdsearch/(.*?)/', link)
    else:
        route_match = re.search(r'-(\d+)\.html', link)
    if route_match is None:
        raise ValueError(f'cannot extract routeId from link: {link}')
    routeid = route_match.group(1)

    param = {
        '_ver': '1.1.8',
        '_lang': lang_by_region[region],
        'type': 'selection' if 'RecommendSelection' in link else 'search' if 'search' in link else 'entity',
        'routeId': routeid,
        'page': page,
        'reqSheinClub': 'true',
        'isPaid': '0',
        'source': 'sort',
        'sourceStatus': '1',
        'sort': '7',
        'requestType': 'firstLoad' if page == 1 else 'pageChange',
    }

    # Optional tracking parameters are copied from the link when present.
    # Each one is matched with the pre-compiled ``self.<name>_pattern``; a
    # missing pattern or a non-matching link simply skips that parameter.
    optional_keys = (
        'ici',
        'srctype',
        'src_tab_page_id',
        'userpath',
        'src_identifier',
        'adp',
        'child_cat_id',
        'src_module',
    )
    for key in optional_keys:
        try:
            param[key] = re.search(getattr(self, f'{key}_pattern'), link).group(1)
        except (AttributeError, IndexError, TypeError, re.error):
            continue

    if 'RecommendSelection' in link:
        param['sub_type'] = 'RecommendSelection'
        param['categoryJump'] = 'true'
        param['src_module'] = 'topcat'

    # Values taken from the link are already percent-encoded, so undo the
    # double-encoding that urlencode() applies to them.
    page_url = base_url + urlencode(param).replace('%25', '%').replace('%20', '+')

    if 'page=' in link:
        referer = link.split('&page')[0] + f'&page={page}'
    elif link.endswith(('html', '/')):
        referer = link + f'?page={page}&sort=7&source=sort&sourceStatus=1'
    else:
        referer = link + f'&page={page}&sort=7&source=sort&sourceStatus=1'

    # Path + query handed to gen_device. The original stripped a hard-coded
    # 'https://th.shein.com', which left the full URL in place for every
    # region other than TH.
    path = page_url[len(host):]

    ua = UserAgent()
    payload = None
    last_error = None
    for _attempt in range(5):
        random_user_agent = ua.random
        device = gen_device(region, referer, path, random_user_agent, site_type='web')
        signed_headers = device.get("msg", {}).get("headers", {})
        headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'zh-CN,zh;q=0.9',
            "referer": referer,
            "anti-in": signed_headers.get("anti-in"),
            "smdeviceid": signed_headers.get("smdeviceid"),
            "armortoken": signed_headers.get("armortoken"),
            "x-gw-auth": signed_headers.get("x-gw-auth"),
            "user-agent": random_user_agent,
            "x-requested-with": "XMLHttpRequest",
            'uber-trace-id': 'ffae1d5fbd3ecdba:ffae1d5fbd3ecdba:0:0',
            'x-csrf-token': 'CmT59WLJ-n7-tWFF--_58eJMNbbrPx6qlMFk',
        }
        try:
            payload = requests.get(
                page_url,
                headers=headers,
                cookies=device.get("msg", {}).get("cookies", {}),
                proxies=self.proxies,
                timeout=20,
                verify=False,
            ).json()
        except Exception as exc:
            # Network failure or non-JSON body: retry with a new device/UA.
            last_error = exc
            continue
        # A usable answer is a JSON object that carries the product list;
        # anything else (empty body, error object) counts as a failed attempt.
        if isinstance(payload, dict) and 'goods' in payload:
            print(f'page:{page} success')
            break
    else:
        raise Exception(page_url) from last_error

    items = payload['goods'] or []
    # Detail-page URL scheme changed on 2024-07-22; the older form
    # '<goods_url_name>-p-<goods_id>-cat-<cat_id>.html' may also still work.
    urls = [
        f'https://{region.lower()}.shein.com/{item["goods_url_name"].replace(" ", "-")}-p-{item["goods_id"]}.html'
        for item in items
    ]
    print('urls:', len(urls), urls[:1])

    # Leaving the ``with`` block waits for every submitted task, and the pool
    # is shut down even if a submit call raises.
    with ThreadPoolExecutor(max_workers=40) as executor:
        for url in urls:
            future = executor.submit(self.request_detail, url, region, referer)
            future.add_done_callback(self.parse_exception)
详情页
import re
import time
import requests
from urllib.parse import urlencode
import urllib3
urllib3.disable_warnings()
from shein_utils import gen_device
from fake_useragent import UserAgent
class SheinDetail:
    """Client for the mobile-site ``productInfo/productDetail/get`` endpoint."""

    # Value sent as ``_lang`` for each supported site code.
    _LANG_BY_REGION = {
        'BR': 'pt-br',
        'PH': 'en',
        'SG': 'en',
        'US': 'en',
        'MY': 'en',
        'TH': 'th',
    }
    # Value sent as ``currency`` for each supported site code.
    _CURRENCY_BY_REGION = {
        'BR': 'BRL',
        'PH': 'PHP',
        'TH': 'THB',
        'SG': 'SGD',
        'MY': 'MYR',
        'US': 'USD',
    }
    # Compiled once; raw strings avoid the invalid-escape warnings the original
    # non-raw literals ('\.' and '\d') trigger on current Python versions.
    _SLUG_PATTERN = re.compile(r'shein\.com/(.*?)\.html')
    _GOODS_ID_PATTERN = re.compile(r'-(\d+)\.html')
    _MAX_ATTEMPTS = 15
    _RETRY_DELAY_SECONDS = 3

    def __init__(self):
        self.product_data_pattern = re.compile('window.gbRawData = (.*?)\n')
        # Passed straight to requests as ``proxies``; empty means no proxy.
        self.proxies = {}

    def detail(self, detail_url, region):
        """Fetch the detail ``modules`` payload for one product page.

        Args:
            detail_url: Product URL of the form
                ``https://<site>.shein.com/<slug>-p-<goods_id>.html``.
            region: Site code; one of ``BR``, ``PH``, ``SG``, ``US``, ``MY``, ``TH``.

        Returns:
            The ``modules`` value of the JSON response, or ``[]`` when every
            attempt was rejected or the response has no ``modules``.

        Raises:
            ValueError: ``detail_url`` does not have the expected shape.
            RuntimeError: ``region`` is not a supported site code.
        """
        slug_match = self._SLUG_PATTERN.search(detail_url)
        if slug_match is None:
            raise ValueError(f'not a product detail url: {detail_url}')
        site = region.lower()
        referer = (
            f'https://m.shein.com/{site}/{slug_match.group(1)}.html'
            f'?ref={site}&rep=dir&ret=m{site}'
        )
        id_match = self._GOODS_ID_PATTERN.search(referer)
        if id_match is None:
            raise ValueError(f'cannot extract goods_id from url: {detail_url}')
        goods_id = id_match.group(1)

        if region not in self._LANG_BY_REGION:
            # The original bare ``raise`` surfaced as a message-less RuntimeError.
            raise RuntimeError(f'unsupported region: {region!r}')

        url = f"https://m.shein.com/{site}/api/productInfo/productDetail/get"
        params = {
            "currency": self._CURRENCY_BY_REGION[region],
            "fromSpa": "1",
            "goods_id": str(goods_id),
            "imgRatio": "3-4",
            "mallCode": "1",
            "showFeedbackRec": "1",
            "template": "0",
            "version": "1.0.4",
            "withI18n": "0",
            "_ver": "1.1.8",
            "_lang": self._LANG_BY_REGION[region],
        }
        # Path + query string handed to gen_device alongside the referer and UA.
        path = url.replace('https://m.shein.com', '') + "?" + urlencode(params)

        ua = UserAgent()
        response = None
        for _attempt in range(self._MAX_ATTEMPTS):
            random_user_agent = ua.random
            device = gen_device(region, referer, path, random_user_agent)
            signed_headers = device.get("msg", {}).get("headers", {})
            headers = {
                "referer": referer,
                "anti-in": signed_headers.get("anti-in"),
                "smdeviceid": signed_headers.get("smdeviceid"),
                "armortoken": signed_headers.get("armortoken"),
                "x-gw-auth": signed_headers.get("x-gw-auth"),
                "user-agent": random_user_agent,
                "x-requested-with": "XMLHttpRequest",
            }
            try:
                response = requests.get(
                    url,
                    headers=headers,
                    cookies=device.get("msg", {}).get("cookies", {}),
                    proxies=self.proxies,
                    params=params,
                    timeout=20,
                    verify=False,
                )
            except Exception:
                time.sleep(self._RETRY_DELAY_SECONDS)
                continue
            rejected = (
                response.status_code == 403
                or 'goods not exist' in response.text
                or 'code":"836100"' in response.text
            )
            if not rejected:
                break
            time.sleep(self._RETRY_DELAY_SECONDS)
        else:
            # Every attempt failed or was rejected.
            return []

        try:
            return response.json()['modules']
        except (ValueError, KeyError, TypeError):
            # Body is not JSON, or it is JSON without a ``modules`` entry.
            return []
if __name__ == '__main__':
    # Manual smoke run: request the detail payload of one sample US product.
    SheinDetail().detail(
        'https://us.shein.com/SHEIN-Comfortcana-4pcs-Set-Striped-Leopard-Print-Solid-Casual-Slim-Fit-Cami-Top-Summer-p-33543404.html',
        'US',
    )
评论页
def comment_page(self, page, product_relation_id, region, store_code, cat_id, sku_id):
    """Fetch one page of reviews from ``bff-api/product/get_goods_review_detail``.

    The request is attempted up to 20 times. After every failed attempt the
    ``anti-in`` entry of ``self.comment_headers`` is replaced with a fresh
    value from ``self.gen_antiin()``; when the body is not JSON, the cookies
    of that response are additionally copied into the ``cookie`` header.

    Args:
        page: Page number, sent as ``page``.
        product_relation_id: Sent as ``goods_spu``.
        region: Site code; one of ``BR``, ``PH``, ``SG``, ``TH``, ``US``, ``MY``.
        store_code: Sent as ``store_code``.
        cat_id: Sent as ``cat_id``.
        sku_id: Sent as ``sku``.

    Returns:
        The decoded JSON object whose ``msg`` is ``'ok'``, or ``None`` when no
        attempt produced one.

    Raises:
        RuntimeError: ``region`` is not a supported site code.
    """
    lang_by_region = {
        'BR': 'pt-br',
        'PH': 'en',
        'SG': 'en',
        'US': 'en',
        'MY': 'en',
        'TH': 'th',
    }
    if region not in lang_by_region:
        # The original bare ``raise`` surfaced as a message-less RuntimeError.
        raise RuntimeError(f'unsupported region: {region!r}')

    base_url = f'https://m.shein.com/{region.lower()}/bff-api/product/get_goods_review_detail?'
    para = {
        '_ver': '1.1.8',
        '_lang': lang_by_region[region],
        'comment_rank': '',
        'goods_id': '',
        'goods_spu': product_relation_id,
        'is_picture': '',
        'local_site_abt_flag': '',
        'local_site_query_flag': '',
        'size': '',
        'sku': sku_id,
        'sort': 'time_desc',
        'store_code': store_code,
        'tag_id': '',
        'tag_rule_id': 'type=B',
        'store_comment_flag': '1',
        'isLowestPriceProductOfBuyBox': '0',
        'mainProductSameGroupId': '',
        'page': page,
        'cat_id': cat_id,
    }
    url = base_url + urlencode(para)

    for _attempt in range(20):
        try:
            response = requests.get(url, headers=self.comment_headers, proxies=self.proxies, timeout=30)
        except Exception:
            # Request itself failed: refresh anti-in and try again.
            self.comment_headers['anti-in'] = self.gen_antiin()
            continue

        try:
            res = response.json()
        except ValueError:
            # Non-JSON body: refresh anti-in and replay the cookies this
            # response set on the next attempt.
            self.comment_headers['anti-in'] = self.gen_antiin()
            self.comment_headers['cookie'] = ';'.join(
                name + '=' + value for name, value in response.cookies.items()
            )
            continue

        # ``.get`` keeps a payload without ``msg`` on the retry path instead
        # of escaping the loop with a KeyError.
        if isinstance(res, dict) and res.get('msg') == 'ok':
            return res
        self.comment_headers['anti-in'] = self.gen_antiin()

    return None
def comment(self, product_relation_id, region, store_code, cat_id, sku_id):
    """Count recent reviews and recent good reviews for one product.

    Walks review pages 1..74 through ``self.comment_page`` (which requests
    them sorted newest-first) and stops at the first review that
    ``self.within_gap`` rejects. Per the original notes the intended window is
    the last 7 days; the actual window is whatever ``self.within_gap``
    implements. A review counts as good when its ``comment_rank`` is >= 4.

    Args:
        product_relation_id: Forwarded to ``comment_page``.
        region: Forwarded to ``comment_page``.
        store_code: Forwarded to ``comment_page``.
        cat_id: Forwarded to ``comment_page``.
        sku_id: Forwarded to ``comment_page``.

    Returns:
        ``(comment_num_7, comment_good_num_7)`` - number of in-window reviews
        and how many of those are good. Always a tuple, including when all 74
        pages were consumed (the original returned ``None`` in that case).
    """
    comment_num_7, comment_good_num_7 = 0, 0
    for page in range(1, 75):
        res = self.comment_page(page, product_relation_id, region, store_code, cat_id, sku_id)
        if not res:
            break
        # Tolerate a payload without ``info`` / ``comment_info``: treat it
        # the same as an empty page.
        comments = (res.get('info') or {}).get('comment_info')
        if not comments:
            break
        for entry in comments:
            if not self.within_gap(int(entry['add_time'])):
                # Everything after the first out-of-window review is older,
                # given the newest-first ordering.
                return comment_num_7, comment_good_num_7
            comment_num_7 += 1
            if int(entry['comment_rank']) >= 4:
                comment_good_num_7 += 1
    return comment_num_7, comment_good_num_7
这里主要列出了shein爬虫的主干代码,涉及细节部分欢迎私聊。
欢迎咨询v 18918051863