frombs4importBeautifulSoup
importrequests
importtime
# Shared accumulator: one dict per scraped attraction is appended here.
info = []

# Request headers that identify the client as Mobile Safari on an iPhone
# (iOS 9.1) instead of the default python-requests user agent.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (iPhone; CPU iPhone OS 9_1 like Mac OS X) '
        'AppleWebKit/601.1.46 (KHTML, like Gecko) '
        'Version/9.0 Mobile/13B143 Safari/601.1'
    ),
}

# Listing pages are addressed by an "oa<offset>" path segment.
_URL_TEMPLATE = (
    'http://www.tripadvisor.cn/Attractions-g60763-Activities-oa{}'
    '-New_York_City_New_York.html#ATTRACTION_LIST'
)
# Offsets advance in steps of 20 (presumably the number of listings per
# page -- confirm against the site) and stop before 976 (presumably the
# total number of listings when this was written -- confirm).
_OFFSET_STEP = 20
_OFFSET_STOP = 976

# One URL per listing page: offsets 0, 20, ..., 960.
urls = [
    _URL_TEMPLATE.format(offset)
    for offset in range(0, _OFFSET_STOP, _OFFSET_STEP)
]
def _stars_per_item(star_divs, group_size=5):
    """Fold a flat sequence of star elements into one rating per group.

    Each element must offer ``.get('class')`` returning its list of class
    tokens. The second token decides what the element contributes:
    ``''`` adds 1, ``'semi_full'`` adds 0.5, anything else adds 0.

    Every ``group_size`` consecutive elements produce one total; a trailing
    incomplete group is discarded.

    Returns:
        A list of totals (int, or float once a half star was added).
    """
    ratings = []
    total = 0
    for position, div in enumerate(star_divs, start=1):
        classes = div.get('class') or []
        # NOTE(review): a class list with a single token is treated like an
        # empty second token instead of raising IndexError. This assumes the
        # markup for a full star carries no modifier class -- confirm
        # against the live page and the installed BeautifulSoup version.
        modifier = classes[1] if len(classes) > 1 else ''
        if modifier == '':
            total += 1
        elif modifier == 'semi_full':
            total += 0.5
        if position % group_size == 0:
            ratings.append(total)
            total = 0
    return ratings


def tripying(url):
    """Fetch one listing page, record its attractions and print them.

    The page at ``url`` is downloaded with the module-level ``headers``,
    parsed with the ``lxml`` parser, and the title, thumbnail URL, rank
    text, category, review-count text and star total of each listing are
    collected. One dict per listing is appended to the module-level
    ``info`` list and printed.

    Only the records scraped by this call are printed. Printing the whole
    of ``info`` on every call repeats every earlier page's rows each time
    another page is fetched.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        None. Results are appended to ``info`` and written to stdout.
    """
    # Without a timeout, requests may wait on an unresponsive server forever.
    wb_data = requests.get(url, headers=headers, timeout=30)
    # Pause after each request; when called in a loop this spaces out
    # successive page fetches by at least two seconds.
    time.sleep(2)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    titles = soup.select(' div.title.titleLLR > div')
    images = soup.find_all('div', 'missing lazyMiss')
    rates = soup.select('div.overflowEllipsis.popIndexLLR > div')
    cates = soup.select('div.attraction_types > span')
    comments = soup.find_all(class_='reviewCountLLR overflowEllipsis')
    stars = soup.select(' div.rating.ratingLLR.overflowEllipsis > div.list_item_rating > div > div > div.center')

    star_totals = _stars_per_item(stars)

    # zip() stops at the shortest input, so a listing missing any one of
    # the fields shortens (and may misalign) the rows for the whole page.
    page_records = [
        {
            'title': title.get_text(),
            'image': image.get('data-thumburl'),
            'rate': rate.get_text(),
            'cate': cate.get_text(),
            'comment': comment.get_text(),
            'star': star,
        }
        for title, image, rate, cate, comment, star
        in zip(titles, images, rates, cates, comments, star_totals)
    ]
    info.extend(page_records)

    for record in page_records:
        print(record['title'], record['image'], record['rate'], record['cate'], record['comment'], '评级:' + str(record['star']) + '星')
# Scrape the listing pages one after another, in the order given by urls.
for page_url in urls:
    tripying(page_url)
离我改变世界又近了一步
通过这个作业，我学习了如何将浏览器伪装成手机，以及如何全面地爬取一些数据。
但是也还有一些问题,就是爬出来的数据会重复很多次,希望有老师能够解答这个问题及原因
又学习了 format 和 range 的用法。
有些数据爬出来还是很困难