def parse_detail(self, response):
    # Three possible responses: 1. normal page, 2. 503 "dogs of Amazon" page (a. retry, b. switch proxy), 3. robot page (solve captcha)
    product_id = response.meta['product_id']
    record_date = response.meta['record_date']
    category_id = response.meta['category_id']
    platform = response.meta['platform']
    rank_type = response.meta['rank_type']
    detail_link = response.meta['detail_link']  # reused when retrying after a captcha
    try_times = response.meta['try_times'] if 'try_times' in response.meta else 0
    try_times_dw = response.meta['try_times_dw'] if 'try_times_dw' in response.meta else 0
    self.log(product_id)
    sel = Selector(response)
    # Try several price locations, from most to least specific.
    price_ele = sel.xpath("//div[@id='snsPrice']//span[contains(@class,'a-color-price')]//text()").extract()
    if not price_ele:
        price_ele = sel.xpath("//div[@id='unqualified']//a[contains(@href,'" + product_id + "')]/following-sibling::span[contains(@class,'a-color-price')]/text()").extract()
    if not price_ele:
        price_ele = sel.xpath("//span[@id='priceblock_ourprice']/text()").extract()
    if not price_ele:
        price_ele = sel.xpath("//div[@id='olp_feature_div']//span//text()").extract()
    # Robot-check page: Amazon returns a captcha form instead of the product page.
    captcha_img = sel.xpath("//form[contains(@action,'validateCaptcha')]//img[contains(@src,'captcha')]/@src")
    if captcha_img:
        # 1. captcha solved -> normal page, 2. captcha failed -> robot page again, 3. give up after N attempts and renew the proxy
        if int(try_times) >= 3:
            yield Request(url=detail_link,
                          callback=self.parse_detail,
                          meta={"product_id": product_id, "record_date": record_date,
                                "category_id": self.category_id, "platform": 'amazon', "rank_type": 'category',
                                'detail_link': detail_link, 'proxy_renew': True},
                          dont_filter=True)
        else:
            try_times = str(int(try_times) + 1)
            # Download the captcha image and send it to the local captcha-solving service.
            img_url = captcha_img.extract_first()
            img_response = requests.get(img_url)
            img = img_response.content
            with open('./a.jpg', 'wb') as f:
                f.write(img)
            imgapi_url = 'http://192.168.6.180:8081/api/solve'
            files = {'image': ('test.png', open('a.jpg', 'rb'), 'image/png')}
            res1 = requests.post(imgapi_url, files=files)
            security_dict = res1.json()
            security = security_dict['code']
            self.log(security)
            # Re-submit the captcha form with the solved text.
            amzn = sel.xpath("//input[@name='amzn']/@value").extract_first()
            url_parsed = urlparse(detail_link)
            amzn_r = (url_parsed.path + '?' + url_parsed.query)
            data = {'amzn': amzn, 'amzn-r': amzn_r, 'field-keywords': security}
            # https://www.amazon.com/errors/validateCaptcha
            yield FormRequest('https://www.amazon.com/errors/validateCaptcha', formdata=data, callback=self.parse_detail, method='GET',
                              meta={"product_id": product_id, "record_date": record_date,
                                    "category_id": self.category_id, "platform": 'amazon', "rank_type": 'category',
                                    'try_times': try_times, 'detail_link': detail_link},
                              dont_filter=True)
    # 503 "dogs of Amazon" page: retry the same URL with a fresh proxy, up to 5 times.
    dongwu_ele = sel.xpath("//div[@id='g']/a/img[@id='d']/@src")
    if dongwu_ele:
        if int(try_times_dw) >= 5:
            pass
        else:
            try_times_dw = str(int(try_times_dw) + 1)
            yield Request(url=detail_link,
                          callback=self.parse_detail,
                          meta={"product_id": product_id, "record_date": record_date,
                                "category_id": self.category_id, "platform": 'amazon', "rank_type": 'category',
                                'detail_link': detail_link, 'proxy_renew': True, 'try_times_dw': try_times_dw},
                          dont_filter=True)
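    # NOTE (assumption): the 'proxy_renew' meta flag set above is expected to be handled by a
    # downloader middleware that is not part of this section. A minimal sketch of such a middleware
    # (ProxyRenewMiddleware, proxy_pool and get_new() are illustrative names, not from this project):
    #
    #     class ProxyRenewMiddleware(object):
    #         def __init__(self, proxy_pool):
    #             self.proxy_pool = proxy_pool
    #
    #         def process_request(self, request, spider):
    #             if request.meta.get('proxy_renew'):
    #                 # Rotate to a fresh proxy; Scrapy's HttpProxyMiddleware then uses meta['proxy'].
    #                 request.meta['proxy'] = self.proxy_pool.get_new()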
    # self.log(price_ele)
    price = ''
    if price_ele:
        price_str = ''.join([x.strip() for x in price_ele])
        # Some offers read "from $xx.xx"; keep only the part after "from".
        i_str = price_str.find('from')
        if i_str != -1:
            price_str = price_str[i_str + 4:]
        if '.' in price_str or ',' in price_str:
            re_str = r'([\d,\.]+)'
        else:
            re_str = r'(\d+)'
        m = re.search(re_str, price_str)
        if m:
            price_num = m.group(1)
            # Whatever immediately precedes the number (up to two characters) is taken as the currency symbol.
            i_price_num = price_str.find(price_num)
            currency = price_str[(i_price_num - 2 if i_price_num > 1 else i_price_num - 1):i_price_num].strip()
            if '.' in str(price_num) or ',' in str(price_num):
                price = currency + str(price_num)
            else:
                # A bare integer price is assumed to be given in cents.
                price = currency + str(float(price_num) / 100)
    # self.log(price)
    if price:
        sql = "UPDATE crawl_product_rank SET cpr_price=%s WHERE cpr_unique_id=%s AND cpr_record_date=%s AND cpr_category_id=%s AND cpr_platform=%s AND cpr_rank_type=%s"
        params = (price, product_id, record_date, category_id, platform, rank_type)
        self.log(params)
        self.db.query(sql, params)
    else:
        availability = sel.xpath("//span[@id='pantry-availability']")
        if availability:
            self.log("Product is not available: " + str(product_id))
        else:
            self.log("Get price error: " + str(product_id))
    # Best Sellers Rank
    rank_str_ele = response.xpath("//table[contains(@id,'productDetails_detailBullets')]//th[contains(./text(),'Best Sellers Rank')]/following-sibling::td//text()")
    if not rank_str_ele:
        rank_str_ele = response.xpath("//li[@id='SalesRank']//text()")
    rank_list = rank_str_ele.extract()
    rank_str = ''
    for x in rank_list:
        x = x.replace('\n', '')
        if x.strip() == '':
            continue
        else:
            rank_str += x
    # Each entry looks like "#123 in Category > Subcategory (...)"; split on '#' and parse rank/category.
    rank_list_2 = rank_str.split('#')
    product_rank_info = []
    for x in rank_list_2:
        item = {}
        m = re.search(r'^(\d+)\s?in\s?([^\(]+)', x)
        if m and m.group(1):
            item['rank'] = m.group(1)
            item['category'] = m.group(2).replace(' > ', '|')
            product_rank_info.append(item)
    if product_rank_info:
        self.log(product_rank_info)
        header = {'Content-Type': 'application/json'}
        yield Request(url="https://amzscout.net/estimator/v2/sales/COM/" + product_id, headers=header,
                      body=json.dumps(product_rank_info), callback=self.get_rank, method="POST",
                      meta={"product_id": product_id, "record_date": record_date, "category_id": self.category_id,
                            "platform": 'amazon', "rank_type": 'category', "product_rank_info": product_rank_info,
                            'try_times': '0'}
                      )
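    # NOTE (inferred from the surrounding code, not from amzscout documentation): the POST body is the
    # list built above, e.g. [{"rank": "123", "category": "Home & Kitchen|Kitchen Storage"}], and the
    # JSON reply handled in get_rank below is expected to contain a 'rankHistory' field.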
# 30-day sales rank history from the amzscout estimator
def get_rank(self, response):
    product_id = response.meta['product_id']
    record_date = response.meta['record_date']
    category_id = response.meta['category_id']
    platform = response.meta['platform']
    rank_type = response.meta['rank_type']
    product_rank_info = response.meta['product_rank_info']
    try_times = response.meta['try_times']
    # response.body is bytes; json.loads handles the decoding directly.
    thirty_all = json.loads(response.body)
    thirty_all_rank = thirty_all['rankHistory']
    if thirty_all_rank:
        last_rank = json.dumps(thirty_all_rank)
        sql = "UPDATE crawl_product_rank SET cpr_rank_thirty=%s WHERE cpr_unique_id=%s AND cpr_record_date=%s AND cpr_category_id=%s AND cpr_platform=%s AND cpr_rank_type=%s"
        params = (last_rank, product_id, record_date, category_id, platform, rank_type)
        self.log(params)
        self.db.query(sql, params)
    else:
        # Empty rank history: retry up to 5 times with a short delay.
        if int(try_times) >= 5:
            pass
        else:
            try_times = str(int(try_times) + 1)
            time.sleep(5)
            header = {'Content-Type': 'application/json'}
            yield Request(url="https://amzscout.net/estimator/v2/sales/COM/" + product_id, headers=header,
                          body=json.dumps(product_rank_info), callback=self.get_rank, method="POST", dont_filter=True,
                          meta={"product_id": product_id, "record_date": record_date, "category_id": self.category_id,
                                "platform": 'amazon', "rank_type": 'category', "product_rank_info": product_rank_info,
                                'try_times': try_times}
                          )
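# NOTE (assumption): self.db is not defined in this section. The UPDATE statements above only need a
# thin wrapper around a MySQL connection; the sketch below shows one possible shape of such a helper
# using pymysql (class name, parameters, and the autocommit choice are illustrative, not taken from
# the original project).
import pymysql

class SimpleDB(object):
    """Minimal query helper matching the self.db.query(sql, params) calls used above."""

    def __init__(self, host, user, password, database):
        self.conn = pymysql.connect(host=host, user=user, password=password,
                                    database=database, charset='utf8mb4', autocommit=True)

    def query(self, sql, params=None):
        # Run a parameterized statement; returns fetched rows (empty tuple for UPDATEs).
        with self.conn.cursor() as cursor:
            cursor.execute(sql, params)
            return cursor.fetchall()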