# Dianping (大众点评) restaurant-listing scraper — shared module-level setup.
import requests
from lxml import etree
import pandas as pd
import time
from pandas import DataFrame, Series  # NOTE(review): unused below — kept to preserve the original imports

# Request headers for every HTTP call. The Cookie placeholder must be
# replaced with a real logged-in session cookie or dianping.com will
# reject/ban the requests.
headers = {
    'Cookie': '************************************************',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36',
    'Connection': 'keep-alive'
}

# Accumulates one dict per scraped shop; turned into a DataFrame in __main__.
info_list = []
def get_url(url):
    """Fetch the category index page and crawl up to 30 listing pages per category.

    Args:
        url: city/channel index page, e.g. ``http://www.dianping.com/shenzhen/ch10``.

    Side effects:
        Delegates each listing page to ``get_href``, which appends one dict
        per shop to the module-level ``info_list``.
    """
    res = requests.get(url, headers=headers)
    selector = etree.HTML(res.text)
    # One link per category in the "classfy" navigation block.
    hrefs = selector.xpath('//*[@id="classfy"]/a/@href')
    for href in hrefs:
        print(href)
        # Listing pages are addressed as <category-url>p1 .. <category-url>p30.
        for i in range(30):
            new_href = href + 'p' + str(i + 1)
            get_href(new_href)
def get_href(new_href):
    """Scrape one listing page and append each shop's fields to ``info_list``.

    Args:
        new_href: full URL of a single paginated listing page.

    Side effects:
        Appends one dict per shop ``<li>`` to the module-level ``info_list``
        and sleeps 3 seconds after the page is processed (rate limiting).
    """
    html = requests.get(new_href, headers=headers)
    selector_2 = etree.HTML(html.text)
    # One <li> element per shop in the results list.
    htmls = selector_2.xpath('//*[@id="shop-all-list"]/ul/li')
    for html_3 in htmls:
        prices = html_3.xpath('div[2]/div[2]/a[2]/b/text()')
        shangqu = html_3.xpath('div[2]/div[3]/a[2]/span/text()')
        pinglun = html_3.xpath('div[2]/div[2]/a[1]/b/text()')
        kouwei = html_3.xpath('div[2]/span/span[1]/b/text()')
        huanjing = html_3.xpath('div[2]/span/span[2]/b/text()')
        fuwu = html_3.xpath('div[2]/span/span[3]/b/text()')
        # These four were previously indexed [0] unconditionally and would
        # raise IndexError on ad/placeholder <li> entries that lack the
        # element; guard them the same way as the optional fields below.
        mingzi = html_3.xpath('div[2]/div[1]/a[1]/h4/text()')
        xingji = html_3.xpath('div[2]/div[2]/span/@title')
        leixing = html_3.xpath('div[2]/div[3]/a[1]/span/text()')
        dizhi = html_3.xpath('div[2]/div[3]/span/text()')
        info = {
            '店名': mingzi[0] if len(mingzi) != 0 else " ",
            '星级': xingji[0] if len(xingji) != 0 else " ",
            '评论数': pinglun[0] if len(pinglun) != 0 else " ",
            '均价': prices[0] if len(prices) != 0 else " ",
            '类型': leixing[0] if len(leixing) != 0 else " ",
            '商区': shangqu[0] if len(shangqu) != 0 else " ",
            '地址': dizhi[0] if len(dizhi) != 0 else " ",
            '口味': kouwei[0] if len(kouwei) != 0 else " ",
            '环境': huanjing[0] if len(huanjing) != 0 else " ",
            '服务': fuwu[0] if len(fuwu) != 0 else " "
        }
        info_list.append(info)
    # Throttle between pages to reduce the chance of being blocked.
    time.sleep(3)
if __name__ == '__main__':
    # Crawl the Shenzhen food channel (ch10), then dump everything scraped
    # into a CSV on the desktop.
    url = 'http://www.dianping.com/shenzhen/ch10'
    get_url(url)
    data = pd.DataFrame(info_list, columns=['店名', "星级", "评论数", "均价", "类型", "商区", "地址", "口味", "环境", "服务"])
    print(data)
    # gb18030 so the file opens correctly in Excel on Chinese Windows.
    # NOTE(review): mode='a+' appends, but header=True rewrites the header
    # row on every run — confirm whether repeated runs are intended.
    data.to_csv(r'C:\Users\Administrator\Desktop\大众点评.csv', header=True, index=False, mode='a+', encoding='gb18030')