Parsing the data with regular expressions
from urllib import request
from xpinyin import Pinyin
import re


def get_aqi(city_pinyin):
    print("get_aqi('%s')" % city_pinyin)
    result = {
        'city': '',     # city name
        'aqi': 0,       # AQI index value
        'level': '',    # AQI level
        'pm25': 0,      # PM2.5 concentration (μg/m³)
        'weather': '',  # weather
        'source': ''    # data source information
    }
    res = request.urlopen('http://www.pm25.com/' + city_pinyin + '.html')
    page = res.read()                # raw bytes of the page
    print('page_size: %d bytes' % len(page))
    page = page.decode('utf-8')      # decode the bytes to a str
    search_obj = re.search(r'<h2 class="bi_loaction_city">(.+?)</h2>', page)
    if search_obj:
        result['city'] = search_obj.group(1)
    search_obj = re.search(r'<a class="bi_aqiarea_num">(\d+)</a>', page)
    if search_obj:
        result['aqi'] = int(search_obj.group(1))
    search_obj = re.search(r'<span class="bi_aqiarea_wuran.*?">(.+?)</span>', page)
    if search_obj:
        result['level'] = search_obj.group(1)
    search_obj = re.search(r'<span class="pm25_span">(\d+)</span>', page)
    if search_obj:
        result['pm25'] = int(search_obj.group(1))
    search_obj = re.search(r'<p class="bi_info_weather">.+?<span>(.+?)</span>(.+?)</p>', page)
    if search_obj:
        result['weather'] = search_obj.group(1) + search_obj.group(2)
    # The pattern matches the Chinese labels on the page: "数据来源" (data source)
    # and "最后更新" (last updated).
    search_obj = re.search(r'<p class="bi_info_tips">数据来源:(.+?)(?: )*最后更新:(.+?)</p>', page)
    if search_obj:
        result['source'] = search_obj.group(1) + ' ' + search_obj.group(2)
    return result


def get_pinyin(city):
    print("get_pinyin('%s')" % city)
    p = Pinyin()
    return p.get_pinyin(city, '')    # e.g. '成都' -> 'chengdu'


if __name__ == '__main__':
    print('''=== City AQI Query
===== Data from http://www.pm25.com''')
    city = input('Target city (e.g. "成都" or "chengdu"): ')
    info = get_aqi(get_pinyin(city))
    print(info)
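The patterns above rely on non-greedy groups such as (.+?) and (\d+) to capture only the text between a fixed opening tag and its closing tag. Below is a minimal offline sketch of that idea, run against a hand-written fragment; the fragment and the values in it are made up for illustration and are not the live pm25.com markup.

import re

# Made-up fragment shaped like the markup the patterns above expect.
snippet = '<h2 class="bi_loaction_city">成都</h2><a class="bi_aqiarea_num">57</a>'

m = re.search(r'<h2 class="bi_loaction_city">(.+?)</h2>', snippet)
if m:
    print(m.group(1))       # 成都 -- group(1) is the text captured by (.+?)

m = re.search(r'<a class="bi_aqiarea_num">(\d+)</a>', snippet)
if m:
    print(int(m.group(1)))  # 57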
Parsing the data with BeautifulSoup
from urllib import request
from xpinyin import Pinyin
from bs4 import BeautifulSoup


def get_aqi(city_pinyin):
    print("get_aqi('%s')" % city_pinyin)
    result = {
        'city': '',     # city name
        'aqi': 0,       # AQI index value
        'level': '',    # AQI level
        'pm25': 0,      # PM2.5 concentration (μg/m³)
        'weather': '',  # weather
        'source': ''    # data source information
    }
    res = request.urlopen('http://www.pm25.com/' + city_pinyin + '.html')
    page = res.read()                # raw bytes of the page
    print('page_size: %d bytes' % len(page))
    page = page.decode('utf-8')      # decode the bytes to a str
    soup = BeautifulSoup(page, 'html.parser')  # parse with the built-in HTML parser
    result['city'] = soup.find(class_='bi_loaction_city').text
    result['aqi'] = int(soup.find('a', class_='bi_aqiarea_num').text)
    result['level'] = soup.find('span', {'class': 'bi_aqiarea_wuran'}).text
    result['pm25'] = int(soup.select('.bi_aqiarea_bottom span')[0].text)
    result['weather'] = soup.find('p', class_='bi_info_weather').text.strip()
    result['source'] = soup.find('p', class_='bi_info_tips').text
    return result


def get_pinyin(city):
    print("get_pinyin('%s')" % city)
    p = Pinyin()
    return p.get_pinyin(city, '')    # e.g. '成都' -> 'chengdu'


if __name__ == '__main__':
    print('''=== City AQI Query
===== Data from http://www.pm25.com''')
    city = input('Target city (e.g. "成都" or "chengdu"): ')
    info = get_aqi(get_pinyin(city))
    print(info)
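The same kind of offline check works for the BeautifulSoup calls: find() returns the first tag matching a name and/or class, select() takes a CSS selector and returns a list of tags, and .text gives a tag's text content. Again, the fragment below is made up for illustration and is not the live pm25.com markup.

from bs4 import BeautifulSoup

# Made-up fragment shaped like the markup the code above expects.
snippet = '<h2 class="bi_loaction_city">成都</h2><span class="pm25_span">38</span>'
soup = BeautifulSoup(snippet, 'html.parser')

print(soup.find(class_='bi_loaction_city').text)  # 成都
print(int(soup.select('.pm25_span')[0].text))     # 38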