案例
搜索商品内容,对数据进行爬取
1701525693273.png
1701525789697.png
请求需要携带请求参数,分页是改变请求参数data中的pNum值
第一个版本:
url = 'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/'
# Query parameters copied verbatim from the captured browser request.
params = {
    'jsv': '2.5.1',
    'appKey': '12574478',
    't': '1701525024723',
    'sign': 'a78a89e5d7e2c932cd697b06a097d680',
    'api': 'mtop.alimama.union.xt.en.api.entry',
    'v': '1.0',
    'AntiCreep': 'true',
    'timeout': '20000',
    'AntiFlood': 'true',
    'type': 'jsonp',
    'dataType': 'jsonp',
    'callback': 'mtopjsonp2',
    # Raw string on purpose: the nested variableMap JSON depends on the
    # literal \" sequences. In a normal string literal Python would turn
    # each \" into a bare ", corrupting the JSON that was captured.
    'data': r'{"pNum":4,"pSize":"60","refpid":"mm_26632258_3504122_32538762","variableMap":"{\"q\":\"电脑\",\"navigator\":false,\"clk1\":\"64bf5020531f085821559b8403613ee8\",\"union_lens\":\"recoveryid:201_33.5.38.198_4517374_1701518614153;prepvid:201_33.102.193.213_4541515_1701518623554\",\"recoveryId\":\"201_33.60.246.239_4546060_1701525022972\"}","qieId":"36308","spm":"a2e0b.20350158.31919782","app_pvid":"201_33.60.246.239_4546060_1701525022972","ctm":"spm-url:a2e0b.20350158.31919782.1;page_url:https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword%3D%25E7%2594%25B5%25E8%2584%2591%26clk1%3D64bf5020531f085821559b8403613ee8%26upsId%3D64bf5020531f085821559b8403613ee8%26spm%3Da2e0b.20350158.31919782.1%26pid%3Dmm_26632258_3504122_32538762%26union_lens%3Drecoveryid%253A201_33.5.38.198_4517374_1701518614153%253Bprepvid%253A201_33.102.193.213_4541515_1701518623554%26pnum%3D4"}',
}
res = requests.get(url, params=params)
# `text` is a property of the response object, not a method; calling it
# would raise "TypeError: 'str' object is not callable".
print(res.text)
失败原因:请求没有做伪装。解决办法:加上伪装,把请求头中重要的字段带上(伪装的信息自己去浏览器中复制,包括请求信息)
# Browser-like request headers; both values were copied from the browser's
# developer tools for the captured request.
headers = dict()
headers['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36'
headers['cookie'] = 'sgcookie=E100K4UGkrJCGZP7iQSBbtQNUH4EnNQTw8zcDamfRuXtDpl2XAPoKGr%2B2Eo8xxkuwL03ZzGhl9VXAX3o6kcnP727X3xdHLYUM%2F2mqpdxB%2FBjgtn1fDgNywQqXSPw3IwwXXSh; thw=cn; tracknick=; cna=NHNaHU0UoS4CAW8W/OUOTUkR; ariaDefaultTheme=undefined; t=17557dbc02546390616f3b6a0f763749; xlly_s=1; _samesite_flag_=true; cookie2=17c389ff42a7da99b7cd8218a90d2eaf; _tb_token_=5eb7e953d73ee; _m_h5_tk=604b3901895d6514eb054f5dd7dd45f6_1701534374436; _m_h5_tk_enc=1bb8d474afa13c2a672f14fbcde0394d; x5secdata=xd3bf2de55f2e9dedf6a95fe832b34c5922cafa7eb57691bac1701525023a-717315356a-1061363021abaad2eaa__bx__h5api.m.taobao.com%3A443%2Fh5%2Fmtop.alimama.union.xt.en.api.entry%2F1.0; tfstk=eiepA2OQqNbHcnLqs9CG41pQ-SIGiJEeByrXE40HNPUTmPsEZarSVYU_rMu3Vwz8eVZztXDIrb37oznuK828_LE44JjEEMkUYYk5oZX0ykrEU87GB2BcRS__KZbcnOfBfYQdoW4nVlI3E1WHLQpaQ8mCMAjz3-f03DhKyL09eFsoAXnTvVQflRBq9qZIWLsy8-2vd-Ymk0ACXGdyaXiNFhVufOvBitmtoMBJaQlNbc3cXGdyaXiZXqjd2QRr_G5..; l=fBMoFqsgNqIOPd1NBOfwourza77OAIRfguPzaNbMi9fPOY1H5FYVW1EgdMTMCnGVesaXR3RxBjFXBc850ydVV4wH6b293O3I3dhyN3pR.; isg=BLe3WWXanuVjoBo-VkRW9LTxRqsBfIve2IjgfAlk_AbtuNb6EUhOLjGemhjmUGNW'
# Same request as before, now sent with the disguising headers.
res = requests.get(url=url, params=params, headers=headers)
问题:这样只能固定拿某一页的数据,不灵活,所以分析,每次请求中有什么不一样
1701526106350.png
1701526124848.png
通过比较,不同页面的sign和t不一样,所以需要知道这两个是怎么来的
分析加密
1701526175979.png
1701526203046.png
1701526285687.png
打断点依次执行
通过浏览器发现,每页的请求参数 t 和sign会变
i = (new Date).getTime()
t = i 拿到的是一个时间戳
j = h(d.token + "&" + i + "&" + g + "&" + c.data)
sign = j sign拿到的是h函数的返回值
假设(伪造数据)
d.token = 'a'  (真实值形如 "1136a8c051f1a5364f0f32372d7f8b4b",来源于请求 cookie 中 _m_h5_tk 的值,取下划线前面的部分)
i = 17000000  (真实值形如 1701520391063,即毫秒时间戳)
g = 123  (真实值为 12574478,即请求参数中的 appKey)
c.data = 111  (真实值为请求携带的参数中 data 对应的字符串)
d.token + "&" + i + "&" + g + "&" + c.data 调用h函数,把这些数据拼在一起然后作为实参
'a&17000000&123&111'
h函数是一个md5函数(严格来说md5是哈希摘要算法,下文沿用“加密”的说法),传入的参数是:
d.token + "&" + i + "&" + g + "&" + c.data
h(1136a8c051f1a5364f0f32372d7f8b4b&1701520391063&12574478&{"pNum":4,"pSize":"60","refpid":"mm_26632258_3504122_32538762","variableMap":"{\"q\":\"电脑\",\"navigator\":false,\"clk1\":\"64bf5020531f085821559b8403613ee8\",\"union_lens\":\"recoveryid:201_33.5.38.198_4517374_1701518614153;prepvid:201_33.102.193.213_4541515_1701518623554\",\"recoveryId\":\"201_33.103.67.168_4539825_1701519809850\"}","qieId":"36308","spm":"a2e0b.20350158.31919782","app_pvid":"201_33.103.67.168_4539825_1701519809850","ctm":"spm-url:a2e0b.20350158.31919782.1;page_url:https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword%3D%25E7%2594%25B5%25E8%2584%2591%26clk1%3D64bf5020531f085821559b8403613ee8%26upsId%3D64bf5020531f085821559b8403613ee8%26spm%3Da2e0b.20350158.31919782.1%26pid%3Dmm_26632258_3504122_32538762%26union_lens%3Drecoveryid%253A201_33.5.38.198_4517374_1701518614153%253Bprepvid%253A201_33.102.193.213_4541515_1701518623554%26pnum%3D4"})
返回请求中的sign值
第二个版本
通过分析,每次请求的t和sign值都不一样;t是时间戳(从1970年1月1日0时0分0秒(UTC)到现在这一刻的毫秒值)
通过python生成
# Compare Python's timestamp with the one produced by JavaScript:
#   Python time.time()      -> 1701521428.9552007
#   JavaScript getTime()    -> 1701521524254
# The formats differ: the JavaScript value has three more integer digits.
print(time.time())
# Scaling by 1000 lines the digits up, e.g. 1701521488188.6692 ...
print(time.time() * 1000)
# ... and truncating to an integer drops the fraction so the formats match.
print(int(time.time() * 1000))
sign值为md5加密,其中几个值是固定的
1701526575971.png
# Exact `data` string of the captured request. Kept as a raw string so the
# \" sequences inside the nested variableMap JSON survive unchanged; a normal
# string literal would collapse each \" into a bare " and alter the hash input.
SEARCH_DATA = r'{"pNum":4,"pSize":"60","refpid":"mm_26632258_3504122_32538762","variableMap":"{\"q\":\"电脑\",\"navigator\":false,\"clk1\":\"64bf5020531f085821559b8403613ee8\",\"union_lens\":\"recoveryid:201_33.5.38.198_4517374_1701518614153;prepvid:201_33.102.193.213_4541515_1701518623554\",\"recoveryId\":\"201_33.102.193.213_4552265_1701521689921\"}","qieId":"36308","spm":"a2e0b.20350158.31919782","app_pvid":"201_33.102.193.213_4552265_1701521689921","ctm":"spm-url:a2e0b.20350158.31919782.1;page_url:https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword%3D%25E7%2594%25B5%25E8%2584%2591%26clk1%3D64bf5020531f085821559b8403613ee8%26upsId%3D64bf5020531f085821559b8403613ee8%26spm%3Da2e0b.20350158.31919782.1%26pid%3Dmm_26632258_3504122_32538762%26union_lens%3Drecoveryid%253A201_33.5.38.198_4517374_1701518614153%253Bprepvid%253A201_33.102.193.213_4541515_1701518623554%26pnum%3D4"}'


def encryption(time1=None):
    """Return the MD5 ``sign`` value for the captured search request.

    Reproduces the page's JavaScript
    ``h(d.token + "&" + i + "&" + g + "&" + c.data)``, where ``h`` is MD5.

    Args:
        time1: Millisecond timestamp as a string. When omitted, the current
            time is used, matching the original zero-argument behaviour.
            Pass it explicitly when the same value must also be sent as the
            ``t`` request parameter.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    token = '1136a8c051f1a5364f0f32372d7f8b4b'  # copied from the cookie
    timestamp = str(int(time.time() * 1000)) if time1 is None else time1
    app_key = '12574478'  # appKey, a fixed value
    # Join the four parts with "&" exactly as the JavaScript does.
    h_str = token + "&" + timestamp + "&" + app_key + "&" + SEARCH_DATA
    return hashlib.md5(h_str.encode()).hexdigest()


# Call the signing function to obtain the sign value.
sign_data = encryption()
完整代码
'''
需求:
通过关键字搜索,抓取响应数据
1- 找到目标url
2- 发请求,需要带上加密参数(t,sign)
3- 获取响应
4- 解析数据
'''
import hashlib
import time
# 定义一个加密函数
def encryption(params_data, time1):
    """Compute the ``sign`` request parameter.

    Reproduces the page's JavaScript
    ``h(d.token + "&" + i + "&" + g + "&" + c.data)``, where ``h`` is MD5.

    Args:
        params_data: The value sent as the ``data`` request parameter. It is
            converted with ``str()`` before hashing, so it must stringify to
            exactly the text that is sent.
        time1: Millisecond timestamp as a string; it must be the same value
            that is sent as the ``t`` request parameter.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.
    """
    # Hard-coded token; presumably copied from the request cookie — it has
    # to be refreshed whenever the cookie changes.
    token = '1136a8c051f1a5364f0f32372d7f8b4b'
    app_key = '12574478'  # appKey, a fixed value
    h_str = token + "&" + time1 + "&" + app_key + "&" + str(params_data)
    return hashlib.md5(h_str.encode()).hexdigest()
# 调用加密函数,拿到加密之后sign值
import json

import requests

# Payload of the `data` request parameter (pNum is the page index).
params_data = {"pNum":0,"pSize":"60","refpid":"mm_26632258_3504122_32538762","variableMap":"{\"q\":\"电脑\",\"navigator\":false,\"clk1\":\"64bf5020531f085821559b8403613ee8\",\"union_lens\":\"recoveryid:201_33.5.38.198_4517374_1701518614153;prepvid:201_33.102.193.213_4541515_1701518623554\",\"recoveryId\":\"201_33.43.43.30_4522654_1701519190042\"}","qieId":"36308","spm":"a2e0b.20350158.31919782","app_pvid":"201_33.43.43.30_4522654_1701519190042","ctm":"spm-url:a2e0b.20350158.31919782.1;page_url:https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword%3D%25E7%2594%25B5%25E8%2584%2591%26clk1%3D64bf5020531f085821559b8403613ee8%26upsId%3D64bf5020531f085821559b8403613ee8%26spm%3Da2e0b.20350158.31919782.1%26pid%3Dmm_26632258_3504122_32538762%26union_lens%3Drecoveryid%253A201_33.5.38.198_4517374_1701518614153%253Bprepvid%253A201_33.102.193.213_4541515_1701518623554%26pnum%3D2"}
# Serialize once as compact JSON (no spaces, non-ASCII kept as-is), the same
# shape as the `data` string shown in the captured browser request.
# str(params_data) would yield a Python repr with single quotes, which is
# not JSON. The identical string is both signed and sent.
data_str = json.dumps(params_data, separators=(',', ':'), ensure_ascii=False)
time1 = str(int(time.time() * 1000))  # millisecond timestamp
sign_data = encryption(data_str, time1)  # sign over the exact data and time sent
url = 'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/'
# Request parameters.
params = {
    'jsv': '2.5.1',
    'appKey': '12574478',
    't': time1,  # integer milliseconds, matching the JavaScript format
    'sign': sign_data,  # MD5 signature
    'api': 'mtop.alimama.union.xt.en.api.entry',
    'v': '1.0',
    'AntiCreep': 'true',
    'timeout': '20000',
    'AntiFlood': 'true',
    # 'type', 'dataType' and 'callback' are left out on purpose: with them
    # the response comes back wrapped as mtopjsonp2({...}) instead of JSON.
    'data': data_str,
}
headers = {
    'Accept': '*/*',
    # NOTE(review): 'br' is advertised here; decoding a Brotli response
    # presumably needs an optional Brotli package — confirm it is installed.
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cookie': 'sgcookie=E100K4UGkrJCGZP7iQSBbtQNUH4EnNQTw8zcDamfRuXtDpl2XAPoKGr%2B2Eo8xxkuwL03ZzGhl9VXAX3o6kcnP727X3xdHLYUM%2F2mqpdxB%2FBjgtn1fDgNywQqXSPw3IwwXXSh; thw=cn; tracknick=; cna=NHNaHU0UoS4CAW8W/OUOTUkR; ariaDefaultTheme=undefined; t=17557dbc02546390616f3b6a0f763749; _m_h5_tk=1136a8c051f1a5364f0f32372d7f8b4b_1701524579905; _m_h5_tk_enc=7a65467d291dbcbb439da7e1dd4445fa; xlly_s=1; _samesite_flag_=true; cookie2=17c389ff42a7da99b7cd8218a90d2eaf; _tb_token_=5eb7e953d73ee; x5secdata=xd83ca9e6fb21e1b262d0f34c86584cba190c2985c611f2c8f1701518730a-717315356a-1061363021abaad2eaa__bx__h5api.m.taobao.com%3A443%2Fh5%2Fmtop.alimama.union.xt.en.api.entry%2F1.0; tfstk=evHDAs9rsjPXsGBI2qeXFrex6WA8hZw_-VBTWRUwazz5MRdjXlcoAV4vfIwxZP0rrrWvcm3ubVmsMnN9lGjjE2XOHEOjbO2TQeLpppnXcRwwJ40ptBofm3UrfppKcmSY0HnypnTiHLsh5MvYSZMvmWUhYJTfE_t_tyo4umnxUy_L-mr1IO40mSAx0zXNQYjrR_5FVxB_4hHP11N4Vu4LQsWwIkfVfQtkq6ObguZvJ3xl11N4Vu4pq3fClSr7DeC..; l=fBMoFqsgNqIOPW-wBOfwFurza77OQIRfguPzaNbMi9fP9M5p52S1W1EgzjT9CnGVesCMR3RxBjFXBmY5mydVV4wH6b293O3j3dhyN3pR.; isg=BB8fIL7Rhr1ghIL2Dkw-vGxprnOphHMmMACYNLFspU4UQD7CuVBfdtgWAtA-WEue',
    'Referer': 'https://uland.taobao.com/',
    'Sec-Ch-Ua': '"Google Chrome";v="119", "Chromium";v="119", "Not?A_Brand";v="24"',
    'Sec-Ch-Ua-Mobile': '?0',
    'Sec-Ch-Ua-Platform': '"Windows"',
    'Sec-Fetch-Dest': 'script',
    'Sec-Fetch-Mode': 'no-cors',
    'Sec-Fetch-Site': 'same-site',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36',
}
# Send the request; the timeout keeps the script from hanging indefinitely
# if the server never answers.
res = requests.get(url, headers=headers, params=params, timeout=10)
# Decode the JSON response body.
res_data = res.json()
print(res_data)
# Walk down to the result list defensively: if any level is missing or
# empty, nothing is listed instead of the script crashing with a KeyError.
# The raw response printed above still shows what the server returned.
result_list = ((res_data.get('data') or {}).get('recommend') or {}).get('resultList') or []
for item in result_list:
    # Print the fields of every item on this page.
    print(item['itemName'], item['shopTitle'], item['monthSellCount'], item['price'])
第三个版本(加上分页)
分页请求中是请求参数的pnum发生改变,可以使用for循环完成,也可以使用递归(回调函数)完成。
使用递归,需要构造函数,所以用了之前的类方式
import hashlib
import time
import requests
class MySpider:
    """Paginated crawler for the Taobao union search API.

    Each request is signed with an MD5 ``sign`` built from a token, a
    millisecond timestamp, the app key and the ``data`` payload. ``parse``
    walks the pages recursively until a page yields no results.
    """

    def __init__(self):
        # Page index sent as ``pNum``; starts at 0.
        self.num = 0
        self.url = 'https://h5api.m.taobao.com/h5/mtop.alimama.union.xt.en.api.entry/1.0/'
        # Browser-like headers copied from a captured request.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/118.0.0.0 Safari/537.36',
            'cookie': 'sgcookie=E100K4UGkrJCGZP7iQSBbtQNUH4EnNQTw8zcDamfRuXtDpl2XAPoKGr%2B2Eo8xxkuwL03ZzGhl9VXAX3o6kcnP727X3xdHLYUM%2F2mqpdxB%2FBjgtn1fDgNywQqXSPw3IwwXXSh; thw=cn; tracknick=; cna=NHNaHU0UoS4CAW8W/OUOTUkR; ariaDefaultTheme=undefined; t=17557dbc02546390616f3b6a0f763749; xlly_s=1; _samesite_flag_=true; cookie2=17c389ff42a7da99b7cd8218a90d2eaf; _tb_token_=5eb7e953d73ee; _m_h5_tk=604b3901895d6514eb054f5dd7dd45f6_1701534374436; _m_h5_tk_enc=1bb8d474afa13c2a672f14fbcde0394d; x5secdata=xd3bf2de55f2e9dedf6a95fe832b34c5922cafa7eb57691bac1701525023a-717315356a-1061363021abaad2eaa__bx__h5api.m.taobao.com%3A443%2Fh5%2Fmtop.alimama.union.xt.en.api.entry%2F1.0; tfstk=eiepA2OQqNbHcnLqs9CG41pQ-SIGiJEeByrXE40HNPUTmPsEZarSVYU_rMu3Vwz8eVZztXDIrb37oznuK828_LE44JjEEMkUYYk5oZX0ykrEU87GB2BcRS__KZbcnOfBfYQdoW4nVlI3E1WHLQpaQ8mCMAjz3-f03DhKyL09eFsoAXnTvVQflRBq9qZIWLsy8-2vd-Ymk0ACXGdyaXiNFhVufOvBitmtoMBJaQlNbc3cXGdyaXiZXqjd2QRr_G5..; l=fBMoFqsgNqIOPd1NBOfwourza77OAIRfguPzaNbMi9fPOY1H5FYVW1EgdMTMCnGVesaXR3RxBjFXBc850ydVV4wH6b293O3I3dhyN3pR.; isg=BLe3WWXanuVjoBo-VkRW9LTxRqsBfIve2IjgfAlk_AbtuNb6EUhOLjGemhjmUGNW',
        }

    def encryption(self, params_data, time1):
        """Compute the ``sign`` request parameter.

        Reproduces the page's JavaScript
        ``h(d.token + "&" + i + "&" + g + "&" + c.data)``, where ``h`` is MD5.

        Args:
            params_data: The value sent as ``data``; converted with ``str()``
                before hashing, so pass the exact string that is sent.
            time1: Millisecond timestamp as a string; must equal the ``t``
                request parameter.

        Returns:
            The 32-character lowercase hexadecimal MD5 digest.
        """
        # Must match the part of the `_m_h5_tk` cookie before the underscore.
        token = '604b3901895d6514eb054f5dd7dd45f6'
        app_key = '12574478'  # appKey, a fixed value
        h_str = token + "&" + time1 + "&" + app_key + "&" + str(params_data)
        return hashlib.md5(h_str.encode()).hexdigest()

    def _build_data(self):
        """Return the ``data`` payload for the current page as compact JSON."""
        import json

        params_data = {"pNum": self.num, "pSize": "60", "refpid": "mm_26632258_3504122_32538762",
                       "variableMap": "{\"q\":\"电脑\",\"navigator\":false,\"clk1\":\"64bf5020531f085821559b8403613ee8\",\"union_lens\":\"recoveryid:201_33.5.38.198_4517374_1701518614153;prepvid:201_33.102.193.213_4541515_1701518623554\",\"recoveryId\":\"201_33.43.43.30_4522654_1701519190042\"}",
                       "qieId": "36308", "spm": "a2e0b.20350158.31919782",
                       "app_pvid": "201_33.43.43.30_4522654_1701519190042",
                       "ctm": "spm-url:a2e0b.20350158.31919782.1;page_url:https%3A%2F%2Fuland.taobao.com%2Fsem%2Ftbsearch%3Frefpid%3Dmm_26632258_3504122_32538762%26keyword%3D%25E7%2594%25B5%25E8%2584%2591%26clk1%3D64bf5020531f085821559b8403613ee8%26upsId%3D64bf5020531f085821559b8403613ee8%26spm%3Da2e0b.20350158.31919782.1%26pid%3Dmm_26632258_3504122_32538762%26union_lens%3Drecoveryid%253A201_33.5.38.198_4517374_1701518614153%253Bprepvid%253A201_33.102.193.213_4541515_1701518623554%26pnum%3D2"}
        # Compact separators and ensure_ascii=False give a real JSON string
        # with no extra spaces and unescaped non-ASCII text. str(params_data)
        # would produce a Python repr with single quotes, which is not JSON.
        return json.dumps(params_data, separators=(',', ':'), ensure_ascii=False)

    def get_response(self):
        """Request the page selected by ``self.num`` and return the response.

        Returns:
            The ``requests`` response object for the signed GET request.
        """
        time1 = str(int(time.time() * 1000))
        # Sign and send the very same string so the signature matches.
        data_str = self._build_data()
        params = {
            'jsv': '2.5.1',
            'appKey': '12574478',
            't': time1,  # integer milliseconds, matching the JavaScript format
            'sign': self.encryption(data_str, time1),
            'api': 'mtop.alimama.union.xt.en.api.entry',
            'v': '1.0',
            'AntiCreep': 'true',
            'timeout': '20000',
            'AntiFlood': 'true',
            'data': data_str,
        }
        # The timeout keeps the crawl from hanging indefinitely on a
        # connection that never answers.
        return requests.get(self.url, headers=self.headers, params=params, timeout=10)

    def parse(self):
        """Fetch the current page, print its items, then recurse to the next.

        The recursion ends as soon as a response carries no result list, so
        an empty or error response stops the crawl instead of looping.
        """
        response = self.get_response()
        # Advance the page counter for the next request.
        self.num += 1
        res_data = response.json()
        data = res_data.get('data') or {}
        result_list = (data.get('recommend') or {}).get('resultList') or []
        # Only parse and request again when this page returned items.
        if result_list:
            print(f'当前是第{self.num}页')
            for item in result_list:
                print(item['itemName'], item['shopTitle'], item['monthSellCount'], item['price'])
            # Recurse for the next page (skipped when there was no data).
            self.parse()


if __name__ == '__main__':
    MySpider().parse()