最近需要从网上下载空气质量数据,看到真气网的历史数据整理得不错,因此想用脚本批量下载;查看网站之后发现,网站通过数据流隐藏了数据,又懒得抓包分析,于是使用 selenium 模拟浏览器进行下载。具体代码如下:
ChromeDriver
历史空气质量数据
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date : 2019-09-14 22:22:06
# @Author : Your Name (you@example.org)
# @Link : http://example.org
# @Version : $Id$
import datetime
import io
import os
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from selenium import webdriver
# Example request:
# https://www.aqistudy.cn/historydata/daydata.php?city=%E5%8C%97%E4%BA%AC&month=2016-11
url_base = 'https://www.aqistudy.cn/historydata/daydata.php'
citys = [
    '北京',
    '天津',
    '石家庄',
    '保定',
    '唐山',
    '秦皇岛',
    '邢台',
    '邯郸',
    '沧州',
]
sdt = datetime.datetime(2013, 12, 1)  # first month to download
edt = datetime.datetime.now()  # last month to download, e.g. datetime.datetime(2013, 12, 1)
# 'MS' (month start) yields one stamp per calendar month from sdt's month up
# to and including edt's month. The old 'm' alias is deprecated and produced
# month-end stamps, which silently dropped the (partial) current month.
months = pd.date_range(sdt, edt, freq='MS')
homedir = os.path.dirname(os.path.realpath(__file__))
output_path = os.path.join(homedir, 'data')

PAGE_LOAD_WAIT = 5  # seconds to wait after driver.get() before reading the page
REQUEST_PAUSE = 2.5  # seconds to pause between two consecutive page requests


def build_url(city, month):
    """Return the daily-data page URL for *city* and *month*.

    Args:
        city: City name as it appears in the site's ``city`` query parameter.
        month: Any object with ``strftime`` (datetime / pandas Timestamp).

    Returns:
        The full URL with a percent-encoded (UTF-8) query string.
    """
    query = urlencode({'city': city, 'month': month.strftime('%Y-%m')})
    return '{}?{}'.format(url_base, query)


def read_month_table(driver, url):
    """Load *url* in the browser and return the first HTML table on the page.

    Args:
        driver: A selenium WebDriver instance.
        url: Page to open.

    Returns:
        A DataFrame parsed from the first ``<table>`` of the page source, or
        ``None`` when pandas finds no table in the page.
    """
    driver.get(url)
    # Fixed wait before the rendered page source is read.
    time.sleep(PAGE_LOAD_WAIT)
    try:
        # header=0: the first table row holds the column names (a bool is
        # rejected by pandas). StringIO avoids the deprecated literal-HTML path.
        tables = pd.read_html(io.StringIO(driver.page_source), header=0)
    except ValueError:
        # pandas raises ValueError when the page contains no table.
        return None
    return tables[0]


def append_to_csv(data, filename):
    """Append *data* to *filename*, writing the header only for a new file."""
    is_new_file = not os.path.exists(filename)
    data.to_csv(filename, index=False, header=is_new_file, mode='a')


def main():
    """Download every configured city/month page into one CSV file per city."""
    print(homedir)
    print(output_path)
    os.makedirs(output_path, exist_ok=True)

    # Requires ChromeDriver, downloadable from
    # http://npm.taobao.org/mirrors/chromedriver/
    # After downloading, place it in the same directory as this script.
    driver = webdriver.Chrome()
    try:
        for city in citys:
            output_filename = os.path.join(output_path, '{}_{}_{}.csv'.format(
                city, sdt.strftime('%Y%m'), edt.strftime('%Y%m')))
            for month in months:
                url = build_url(city, month)
                data = read_month_table(driver, url)
                if data is None:
                    print('no table found, skipped:', city, month, url)
                else:
                    append_to_csv(data, output_filename)
                    print(city, month, url)
                time.sleep(REQUEST_PAUSE)
    finally:
        # Always close the browser, even when a download fails midway.
        driver.quit()


if __name__ == '__main__':
    main()
其他问题自行解决!