1、时间格式转换(2014->20140101->20141231)
2、日期数据与其他日期数据合并(pd.merge)
3、计算相关性以及p值
# Python
# read in emission
import datetime
from datetime import timedelta
# Load the emission table and derive a year-end timestamp column 'ds' from
# its 'Year' column (e.g. 2014 -> 2014-01-01 -> 2014-12-31).
ems = pd.read_csv('./csv/emission.csv')
# format='%Y' parses a bare year as January 1st of that year.
ems['ds'] = pd.to_datetime(ems['Year'], format='%Y')
year = ems['ds'].dt.year.values

# method 1: rebuild the whole column in one step from year/month/day parts.
# The earlier element-wise ``ems['ds'][i] = ...`` was chained assignment: it
# raises SettingWithCopyWarning and is a silent no-op under pandas
# Copy-on-Write, which would leave 'ds' on January 1st.
ems['ds'] = pd.to_datetime(
    pd.DataFrame({'year': year, 'month': 12, 'day': 31}, index=ems.index)
)
print(ems['ds'])

# method 2: build the same dates as a list of datetime objects; stored in a
# separate column 'hi' so it can be compared with method 1.
te = [datetime.datetime(y, 12, 31) for y in year]
ems['hi'] = te
print(ems['hi'])
# Index the emission table by its 'ds' date so each emission series carries
# the merge key with it.
ems = ems.set_index('ds')

# Outer-merge the three emission columns into sumey one at a time, keyed on
# 'ds' and sorted by that key.
for emission_col in ('PM_per', 'NO_per', 'SO_per'):
    sumey = pd.merge(sumey, ems[emission_col], on='ds', how='outer', sort=True)

print(type(sumey))
print(sumey)
# Pairwise Pearson correlation (r and p-value) for every pair of columns in
# sumey, followed by the full correlation matrix and a scatter-matrix plot.
columns = list(sumey.columns)
for i, num in enumerate(columns):
    print('i=', i)
    x = sumey[num]
    for other in columns[i + 1:]:
        y = sumey[other]
        print('x=', num, 'y=', other)
        # Keep only the rows where both series have a value.
        valid = ~np.logical_or(np.isnan(x), np.isnan(y))
        x1 = np.compress(valid, x)
        x2 = np.compress(valid, y)
        if len(x1) < 2:
            # pearsonr raises ValueError with fewer than two observations;
            # report NaN for such pairs instead of aborting the whole run.
            r = p = np.nan
        else:
            r, p = stats.pearsonr(x1, x2)
        print('r = %6.5f,p = %6.5f' % (r, p))

# Compute the correlation matrix once and reuse it for console and CSV.
corr_matrix = sumey.corr()
print(corr_matrix)
corr_matrix.to_csv('./csv/correlation.csv')

# NOTE(review): per the pandas docs scatter_matrix returns an array of Axes,
# not a Figure; the name `fig` is kept in case later code refers to it.
fig = pd.plotting.scatter_matrix(
    sumey,
    figsize=(6, 6),
    c='blue',
    marker='o',
    diagonal='',
    alpha=0.8,
    range_padding=0.2,
)
#plt.show()