import requests
from bs4 import BeautifulSoup
from pyecharts import Bar
# Module-level accumulator shared by the crawler: parse_page() appends one
# {'城市': <city name>, '最低气温': <int low temperature>} dict per city row,
# and spider() sorts this list in place before charting it.
all_data = []
def parse_page(url):
    """Fetch one regional forecast page and record each city's low temperature.

    For every city row found in the first ``div.conMidtab`` of the page, a
    dict ``{'城市': city, '最低气温': low}`` (``low`` as ``int``) is appended
    to the module-level ``all_data`` list. Nothing is returned.

    Rows that cannot be parsed (too few cells, empty city cell, or a
    non-numeric temperature) are skipped instead of aborting the crawl.
    If the page has no ``div.conMidtab`` at all, no data is added.

    Args:
        url: Address of the forecast page to download.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Without a timeout a stalled connection would hang the crawl forever.
    response = requests.get(url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx rather than trying to parse an error page.
    response.raise_for_status()
    text = response.content.decode('utf-8')
    # The Hong Kong/Macau/Taiwan page has incomplete HTML; 'html5lib' is more
    # fault-tolerant than lxml for it, at the cost of being slower.
    soup = BeautifulSoup(text, 'html5lib')
    today = soup.find('div', class_="conMidtab")
    if today is None:
        # No forecast container on this page: nothing to record.
        return
    # One table per province / municipality.
    for table in today.find_all('table'):
        # Data rows start at the third <tr>; the first two rows are skipped.
        rows = table.find_all('tr')[2:]
        for index, tr in enumerate(rows):
            tds = tr.find_all('td')
            if len(tds) < 2:
                continue
            # The first data row of a table is laid out differently from the
            # rest: its city name is in the second cell, not the first.
            city_td = tds[1] if index == 0 else tds[0]
            city = next(city_td.stripped_strings, None)
            # The second-to-last cell holds the low temperature.
            low_weather = next(tds[-2].stripped_strings, None)
            if city is None or low_weather is None:
                continue
            try:
                low = int(low_weather)
            except ValueError:
                # Non-numeric placeholder; skip this row.
                continue
            all_data.append({'城市': city, '最低气温': low})
def spider():
    """Crawl all regional forecast pages and chart the ten lowest temperatures.

    Calls ``parse_page`` for every regional URL (which fills the module-level
    ``all_data`` list), sorts ``all_data`` in place by ascending
    ``'最低气温'``, and renders a bar chart of the first ten entries to
    ``temperature.html``.
    """
    # A tuple rather than a set: set iteration order is not fixed between
    # runs, and because list.sort() is stable the crawl order decides how
    # cities with equal temperatures are ranked. A fixed order keeps the
    # resulting top ten reproducible.
    urls = (
        'http://www.weather.com.cn/textFC/hb.shtml',
        'http://www.weather.com.cn/textFC/db.shtml',
        'http://www.weather.com.cn/textFC/hd.shtml',
        'http://www.weather.com.cn/textFC/hz.shtml',
        'http://www.weather.com.cn/textFC/hn.shtml',
        'http://www.weather.com.cn/textFC/xb.shtml',
        'http://www.weather.com.cn/textFC/xn.shtml',
        'http://www.weather.com.cn/textFC/gat.shtml',
    )
    for url in urls:
        parse_page(url)

    # Coldest first; sorts the shared list in place.
    all_data.sort(key=lambda data: data['最低气温'])
    data = all_data[:10]
    cities = [entry['城市'] for entry in data]
    weather = [entry['最低气温'] for entry in data]

    chart = Bar("中国天气最低温排行榜")  # bar chart
    chart.add('', cities, weather)
    chart.render('temperature.html')
# Script entry: run the crawl and render the chart. There is no __main__
# guard, so this also executes when the module is imported.
spider()
# ps:这次爬虫需要注意的是:lxml解析器不一定很好用;html5lib虽然兼容性高,但是速度慢。另外还要注意map函数以及sort和lambda函数的用法。