一.爬虫目的
爬取webservice上提供的json格式的气象数据,解析后存入SQLServer数据库。
二.开发环境
Python2.7+Spyder+SQLServer2008
三.注意事项
1.webservice所提供的气象数据是定时刷新的,所以要制作定时器,定时爬取
2.在模拟登录、验证身份信息的时候,要注意在请求中带上自己的sessionID
四.定时爬虫程序代码
# -*- coding: utf-8 -*-
"""
Created on Sat Nov 04 22:30:37 2017
@author: Administrator
"""
import requests
import json
from apscheduler.schedulers.blocking import BlockingScheduler
from apscheduler.executors.pool import ThreadPoolExecutor, ProcessPoolExecutor
import pymssql
class MSSQL(object):
    """Fetch the latest sensor reading from the web service and store it in SQL Server.

    The instance holds the database connection settings.  ``login`` calls the
    service's login endpoint, ``accept`` downloads the latest reading as JSON
    and inserts it into ``[dbo].[wea]``.
    """

    # JSON keys of one reading, in the order the values are written to
    # [dbo].[wea].  The INSERT has no column list, so this order is assumed
    # to match the table's column order -- TODO confirm against the schema.
    FIELDS = (
        u'id', u'Time', u'Version', u'Status', u'Battery',
        u'Temperature', u'Humidity', u'Light', u'Noise', u'Pressure',
        u'PM2p5', u'PM10', u'WindSpeed', u'WindDirection',
        u'CO2', u'CO', u'VOC', u'SO2', u'NO2', u'O3',
        u'HumanFlow', u'HumanFlow2', u'HumanFlow3', u'HumanFlow4',
        u'CarFlow', u'CarFlow2', u'CarFlow3', u'CarFlow4',
        u'CarSpeed', u'CarSpeed2', u'CarSpeed3', u'CarSpeed4',
        u'GPSLongDeg', u'GPSLongMin', u'GPSLongSec',
        u'GPSLatDeg', u'GPSLatMin', u'GPSLatSec',
        u'UV', u'HCHO',
    )

    # Parameterized statement: one driver-side "%s" placeholder per field, so
    # values are quoted by the driver instead of being spliced into the SQL.
    INSERT_SQL = (
        "insert into [dbo].[wea] values ("
        + ", ".join(["%s"] * len(FIELDS))
        + ");"
    )

    LOGIN_URL = 'http://www.citygrid.net.cn/sensor/api/ApiLogin/'
    DATA_URL = ("http://www.citygrid.net.cn/sensor/api/"
                "GetApiLatestvaluebydevicecode/?devicecode=*******&format=json")

    # Seconds to wait for the web service before giving up; without a timeout
    # a stalled server would block the scheduled job indefinitely.
    REQUEST_TIMEOUT = 30

    def __init__(self, host='*****', user='***', pwd='****', db='****'):
        """Store the SQL Server connection settings; no connection is opened yet."""
        self.host = host
        self.user = user
        self.pwd = pwd
        self.db = db
        self.conn = None  # set by connect()

    def connect(self):
        """Open a new database connection and return a cursor on it.

        The connection itself is kept in ``self.conn`` so the caller can
        commit and close it.

        Raises:
            NameError: if no cursor could be obtained.
        """
        self.conn = pymssql.connect(host=self.host, user=self.user,
                                    password=self.pwd, database=self.db)
        cur = self.conn.cursor()
        if not cur:
            # Instantiate the exception; ``raise (NameError, msg)`` would
            # raise a tuple and lose the message.
            raise NameError("连接数据库失败")
        return cur

    def login(self):
        """Call the service's login endpoint and print the response body.

        NOTE(review): the credentials are hard-coded and sent as query-string
        parameters over plain HTTP; consider moving them to configuration.
        """
        payload = {"userid": "gttdzzzx", "pwd": "guotu123"}
        r = requests.get(self.LOGIN_URL, params=payload,
                         timeout=self.REQUEST_TIMEOUT)
        print(r.text)

    @classmethod
    def _row_params(cls, record):
        """Return the INSERT parameters for one reading, in ``FIELDS`` order.

        Values are rendered as text, matching the quoted literals the table
        has always received; JSON ``null`` becomes SQL NULL rather than the
        text 'None'.

        Raises:
            KeyError: if the reading lacks one of the expected fields.
        """
        return tuple(
            None if record[name] is None else u'%s' % (record[name],)
            for name in cls.FIELDS
        )

    def accept(self):
        """Download the latest reading and insert it into ``[dbo].[wea]``.

        A fresh database connection is opened for each call and is always
        closed, even when the insert fails.
        """
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0',
            "Host": "www.citygrid.net.cn",
            "Referer": "http://www.citygrid.net.cn/sensor/api/GetApiLatestvaluebydevicecode",
            # The session id is sent explicitly as a cookie on every request.
            "Cookie": "sessionid=******",
        }
        results = requests.get(self.DATA_URL, headers=headers,
                               timeout=self.REQUEST_TIMEOUT)
        print(results.text)

        parsed = json.loads(results.text)
        # The service is expected to return a single JSON object; a list of
        # objects is accepted as well and stored row by row.
        datalist = parsed if isinstance(parsed, list) else [parsed]

        cursor = self.connect()
        try:
            for ele in datalist:
                print(ele)
                cursor.execute(self.INSERT_SQL, self._row_params(ele))
            self.conn.commit()
        finally:
            self.conn.close()
def main():
    """Log in, store one reading immediately, then store one every 60 seconds.

    ``sched.start()`` blocks, so this function does not return while the
    scheduler is running.
    """
    obj = MSSQL(host='****8', user='***', pwd='***', db='*****')
    obj.login()
    # accept() opens and closes its own database connection on every call,
    # so no separate connect() is made here: that connection would never be
    # used or closed.
    obj.accept()
    sched = BlockingScheduler()
    sched.add_job(obj.accept, 'interval', seconds=60)
    sched.start()


if __name__ == '__main__':
    main()