import pandas as pd
# Load the Ebola time series; parse_dates=[0] parses the first column (Date) as datetimes.
ebola = pd.read_csv('data/country_timeseries.csv', parse_dates=[0])
# Preview the first five rows and the first five columns.
print(ebola.iloc[:5, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
0 2015-01-05 289 2776.0 NaN 10030.0
1 2015-01-04 288 2775.0 NaN 9780.0
2 2015-01-03 287 2769.0 8166.0 9722.0
3 2015-01-02 286 NaN 8157.0 NaN
4 2014-12-31 284 2730.0 8115.0 9633.0
'''
基于日期数据获取子集
# Boolean masks built from the .dt accessor pick out the rows dated June 2014.
in_2014 = ebola.Date.dt.year == 2014
in_june = ebola.Date.dt.month == 6
print(ebola.loc[in_2014 & in_june].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
79 2014-06-30 100 413.0 107.0 239.0
80 2014-06-22 92 NaN 51.0 NaN
81 2014-06-20 90 390.0 NaN 158.0
82 2014-06-19 89 NaN 41.0 NaN
83 2014-06-18 88 390.0 NaN 136.0
84 2014-06-17 87 NaN NaN 97.0
85 2014-06-16 86 398.0 33.0 NaN
86 2014-06-10 80 351.0 13.0 89.0
87 2014-06-05 75 NaN 13.0 81.0
88 2014-06-03 73 344.0 13.0 NaN
89 2014-06-01 71 328.0 13.0 79.0
'''
DatetimeIndex 对象
处理包含datetime的数据时,经常把datetime对象设置成DataFrame的索引
# Use the Date column as the row index; the printed index is a DatetimeIndex.
ebola.index = ebola['Date']
print(ebola.index)
'''
DatetimeIndex(['2015-01-05', '2015-01-04', '2015-01-03', '2015-01-02',
'2014-12-31', '2014-12-28', '2014-12-27', '2014-12-24',
'2014-12-21', '2014-12-20',
...
'2014-04-04', '2014-04-01', '2014-03-31', '2014-03-29',
'2014-03-28', '2014-03-27', '2014-03-26', '2014-03-25',
'2014-03-24', '2014-03-22'],
dtype='datetime64[ns]', name='Date', length=122, freq=None)
'''
# Select every row from a given year by partial-string indexing on the DatetimeIndex.
# Row lookups by date string must go through .loc: plain ebola['2015'] is
# treated as a column lookup in pandas 2.0+ and raises KeyError.
print(ebola.loc['2015'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 2015-01-05 289 2776.0 NaN 10030.0
2015-01-04 2015-01-04 288 2775.0 NaN 9780.0
2015-01-03 2015-01-03 287 2769.0 8166.0 9722.0
2015-01-02 2015-01-02 286 NaN 8157.0 NaN
'''
# Select every row from a given year and month by partial-string indexing.
# Row lookups by date string must go through .loc: plain ebola['2014-06'] is
# treated as a column lookup in pandas 2.0+ and raises KeyError.
print(ebola.loc['2014-06'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2014-06-30 2014-06-30 100 413.0 107.0 239.0
2014-06-22 2014-06-22 92 NaN 51.0 NaN
2014-06-20 2014-06-20 90 390.0 NaN 158.0
2014-06-19 2014-06-19 89 NaN 41.0 NaN
2014-06-18 2014-06-18 88 390.0 NaN 136.0
2014-06-17 2014-06-17 87 NaN NaN 97.0
2014-06-16 2014-06-16 86 398.0 33.0 NaN
2014-06-10 2014-06-10 80 351.0 13.0 89.0
2014-06-05 2014-06-05 75 NaN 13.0 81.0
2014-06-03 2014-06-03 73 344.0 13.0 NaN
2014-06-01 2014-06-01 71 328.0 13.0 79.0
'''
TimedeltaIndex 对象
用日期运算的结果作为index后,可以直接用TimedeltaIndex对象作为索引,但是必须要注意index顺序,从上到下。
# Time elapsed since the earliest date in the data, stored as a column and
# then used as the row index (printed below as a TimedeltaIndex).
first_day = ebola['Date'].min()
ebola['outbreak_d'] = ebola['Date'] - first_day
ebola.index = ebola['outbreak_d']
print(ebola.index)
'''
TimedeltaIndex(['289 days', '288 days', '287 days', '286 days', '284 days',
'281 days', '280 days', '277 days', '274 days', '273 days',
...
'13 days', '10 days', '9 days', '7 days', '6 days',
'5 days', '4 days', '3 days', '2 days', '0 days'],
dtype='timedelta64[ns]', name='outbreak_d', length=122, freq=None)
'''
print(ebola.iloc[:5, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
outbreak_d
289 days 2015-01-05 289 2776.0 NaN 10030.0
288 days 2015-01-04 288 2775.0 NaN 9780.0
287 days 2015-01-03 287 2769.0 8166.0 9722.0
286 days 2015-01-02 286 NaN 8157.0 NaN
284 days 2014-12-31 284 2730.0 8115.0 9633.0
'''
print(ebola['289 days': '280 days'].iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
outbreak_d
289 days 2015-01-05 289 2776.0 NaN 10030.0
288 days 2015-01-04 288 2775.0 NaN 9780.0
287 days 2015-01-03 287 2769.0 8166.0 9722.0
286 days 2015-01-02 286 NaN 8157.0 NaN
284 days 2014-12-31 284 2730.0 8115.0 9633.0
281 days 2014-12-28 281 2706.0 8018.0 9446.0
'''
# Wrong slice order: the index runs from 289 days down to 0 days, so an
# ascending '280 days':'289 days' slice matches nothing and the result is an
# empty DataFrame.
print(ebola['280 days': '289 days'].iloc[:, :5])
'''
Empty DataFrame
Columns: [Date, Day, Cases_Guinea, Cases_Liberia, Cases_SierraLeone]
Index: []
'''
日期范围
2015-01-01和2014-03-23的数据是缺失的
ebola = pd.read_csv('data/country_timeseries.csv', parse_dates=[0])
print(ebola.iloc[:5, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
0 2015-01-05 289 2776.0 NaN 10030.0
1 2015-01-04 288 2775.0 NaN 9780.0
2 2015-01-03 287 2769.0 8166.0 9722.0
3 2015-01-02 286 NaN 8157.0 NaN
4 2014-12-31 284 2730.0 8115.0 9633.0
'''
print(ebola.iloc[-5:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
117 2014-03-27 5 103.0 8.0 6.0
118 2014-03-26 4 86.0 NaN NaN
119 2014-03-25 3 86.0 NaN NaN
120 2014-03-24 2 86.0 NaN NaN
121 2014-03-22 0 49.0 NaN NaN
'''
创建一个日期范围来为数据集重建索引
# Daily range covering 2014-12-31 through 2015-01-05 inclusive.
head_range = pd.date_range('2014-12-31', '2015-01-05', freq='D')
print(head_range)
'''
DatetimeIndex(['2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
'2015-01-04', '2015-01-05'],
dtype='datetime64[ns]', freq='D')
'''
在这个例子中,只取前5行数据,想把head_range设置为ebola_5的索引,需要先把日期设置为ebola_5的索引,然后为数据重建索引
# Work on the first five rows only, indexed by their dates.
ebola_5 = ebola.head()
ebola_5.index = ebola_5['Date']
# reindex returns a new DataFrame instead of modifying in place, so the result
# must be assigned back; otherwise the frame printed below is unchanged.
# Dates in head_range that are absent from the data (2015-01-01) become
# all-missing rows, and the rows follow head_range's ascending order.
ebola_5 = ebola_5.reindex(head_range)
print(ebola_5.iloc[:, :5])
'''
Date Day Cases_Guinea Cases_Liberia Cases_SierraLeone
2014-12-31 2014-12-31 284.0 2730.0 8115.0 9633.0
2015-01-01 NaT NaN NaN NaN NaN
2015-01-02 2015-01-02 286.0 NaN 8157.0 NaN
2015-01-03 2015-01-03 287.0 2769.0 8166.0 9722.0
2015-01-04 2015-01-04 288.0 2775.0 NaN 9780.0
2015-01-05 2015-01-05 289.0 2776.0 NaN 10030.0
'''
频率
在date_range函数中有一个参数freq,其默认值为D(代表day),表示日期范围内的值是逐日递增的。
print(pd.date_range('2017-01-01', '2017-01-07', freq='B'))
'''
DatetimeIndex(['2017-01-02', '2017-01-03', '2017-01-04', '2017-01-05',
'2017-01-06'],
dtype='datetime64[ns]', freq='B')
'''
偏移量
偏移量是在基本频率上做的一点调整,例如可以向刚刚创建的工作日范围添加一个偏移量,这样就可以隔一个工作日取一个工作日。
在基本频率前加一个倍数值就创建出了该偏移量
# Every other business day in the week starting 2017-01-01: the multiplier in
# front of the base frequency ('2B') sets the step.
print(pd.date_range('2017-01-01', '2017-01-07', freq='2B'))
# DatetimeIndex(['2017-01-02', '2017-01-04', '2017-01-06'], dtype='datetime64[ns]', freq='2B')
偏移量可以和其他基本频率结合使用
# The first Thursday of every month ('WOM-1THU': week-of-month 1, Thursday).
print(pd.date_range('2017-01-01', '2017-12-31', freq='WOM-1THU'))
'''
DatetimeIndex(['2017-01-05', '2017-02-02', '2017-03-02', '2017-04-06',
'2017-05-04', '2017-06-01', '2017-07-06', '2017-08-03',
'2017-09-07', '2017-10-05', '2017-11-02', '2017-12-07'],
dtype='datetime64[ns]', freq='WOM-1THU')
'''
移动
有时需要更改数据的日期,例如修正数据中的某个测量误差,或者对数据的开始日期进行标准化,以便比较趋势。
比如需要比较不同国家的疫情传播速度,但是不同国家爆发疫情的时间不同,很难比较各国疫情的爆发情况。
ebola_sub = ebola[['Day', 'Cases_Guinea', 'Cases_Liberia']]
print(ebola_sub.tail(10))
'''
Day Cases_Guinea Cases_Liberia
112 13 143.0 18.0
113 10 127.0 8.0
114 9 122.0 8.0
115 7 112.0 7.0
116 6 112.0 3.0
117 5 103.0 8.0
118 4 86.0 NaN
119 3 86.0 NaN
120 2 86.0 NaN
121 0 49.0 NaN
'''
最好所有的日期都从常用的0天开始。
(1)由于有些日期没有列出来,所以需要为数据集的所有日期创建一个日期范围。
(2)需要计算数据集中最早日期和每列最早有效日期(非NaN)之间的差值。
(3)然后根据计算结果移动每列。
开始之前,首先读取ebola数据集的一个副本。同时把Date解析为date对象,并把日期指派给index。本例中会解析日期并直接设置为索引。
ebola = pd.read_csv('data/country_timeseries.csv', index_col='Date', parse_dates=['Date'])
print(ebola.head().iloc[:, :4])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 289 2776.0 NaN 10030.0
2015-01-04 288 2775.0 NaN 9780.0
2015-01-03 287 2769.0 8166.0 9722.0
2015-01-02 286 NaN 8157.0 NaN
2014-12-31 284 2730.0 8115.0 9633.0
'''
new_idx = pd.date_range(ebola.index.min(), ebola.index.max())
print(new_idx)
'''
DatetimeIndex(['2014-03-22', '2014-03-23', '2014-03-24', '2014-03-25',
'2014-03-26', '2014-03-27', '2014-03-28', '2014-03-29',
'2014-03-30', '2014-03-31',
...
'2014-12-27', '2014-12-28', '2014-12-29', '2014-12-30',
'2014-12-31', '2015-01-01', '2015-01-02', '2015-01-03',
'2015-01-04', '2015-01-05'],
dtype='datetime64[ns]', length=290, freq='D')
'''
# new_idx is ascending while the data lists the newest date first; reverse it
# so the reindexed frame keeps the original newest-to-oldest order.
# Note: reversed() returns a one-shot iterator, so after this line new_idx is
# no longer a DatetimeIndex.
new_idx = reversed(new_idx)
# Dates that were absent from the data (e.g. 2015-01-01) become all-NaN rows.
ebola = ebola.reindex(new_idx)
print(ebola.head().iloc[:, :4])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone
Date
2015-01-05 289.0 2776.0 NaN 10030.0
2015-01-04 288.0 2775.0 NaN 9780.0
2015-01-03 287.0 2769.0 8166.0 9722.0
2015-01-02 286.0 NaN 8157.0 NaN
2015-01-01 NaN NaN NaN NaN
'''
# Earliest valid (non-NaN) date of each column. The rows run from the newest
# date to the oldest, so a column's last valid index is its earliest date
# with data.
# Series.last_valid_index returns the index label of the last non-missing
# value; first_valid_index is the counterpart for the first non-missing value.
last_valid = ebola.apply(pd.Series.last_valid_index)
print(last_valid)
'''
Day 2014-03-22
Cases_Guinea 2014-03-22
Cases_Liberia 2014-03-27
Cases_SierraLeone 2014-03-27
Cases_Nigeria 2014-07-23
Cases_Senegal 2014-08-31
Cases_UnitedStates 2014-10-01
Cases_Spain 2014-10-08
Cases_Mali 2014-10-22
Deaths_Guinea 2014-03-22
Deaths_Liberia 2014-03-27
Deaths_SierraLeone 2014-03-27
Deaths_Nigeria 2014-07-23
Deaths_Senegal 2014-09-07
Deaths_UnitedStates 2014-10-01
Deaths_Spain 2014-10-08
Deaths_Mali 2014-10-22
dtype: datetime64[ns]
'''
# Earliest date in the data set.
earliest_date = ebola.index.min()
print(earliest_date)
# 2014-03-22 00:00:00
# Difference between each column's earliest valid date and the earliest date
# in the data set.
shift_values = last_valid - earliest_date
print(shift_values)
'''
Day 0 days
Cases_Guinea 0 days
Cases_Liberia 5 days
Cases_SierraLeone 5 days
Cases_Nigeria 123 days
Cases_Senegal 162 days
Cases_UnitedStates 193 days
Cases_Spain 200 days
Cases_Mali 214 days
Deaths_Guinea 0 days
Deaths_Liberia 5 days
Deaths_SierraLeone 5 days
Deaths_Nigeria 123 days
Deaths_Senegal 169 days
Deaths_UnitedStates 193 days
Deaths_Spain 200 days
Deaths_Mali 214 days
dtype: timedelta64[ns]
'''
# Iterate over the columns and shift each one down by the number of days in
# its shift_values entry, so every column's earliest valid value lands on the
# last row. (All offsets here are non-negative; a negative offset would shift
# the column up instead.)
# The offset is looked up by column label: integer-position access on a
# label-indexed Series (shift_values[0]) is deprecated in recent pandas.
ebola_dict = {
    col: ebola[col].shift(shift_values[col].days)
    for col in ebola
}
ebola_shift = pd.DataFrame(ebola_dict)
# Reselect with the original columns so the column order matches ebola.
ebola_shift = ebola_shift[ebola.columns]
# Every column now has a value in its last row.
print(ebola_shift.tail())
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone \
Date
2014-03-26 4.0 86.0 8.0 2.0
2014-03-25 3.0 86.0 NaN NaN
2014-03-24 2.0 86.0 7.0 NaN
2014-03-23 NaN NaN 3.0 2.0
2014-03-22 0.0 49.0 8.0 6.0
Cases_Nigeria Cases_Senegal Cases_UnitedStates Cases_Spain \
Date
2014-03-26 1.0 NaN 1.0 1.0
2014-03-25 NaN NaN NaN NaN
2014-03-24 NaN NaN NaN NaN
2014-03-23 NaN NaN NaN NaN
2014-03-22 0.0 1.0 1.0 1.0
Cases_Mali Deaths_Guinea Deaths_Liberia Deaths_SierraLeone \
Date
2014-03-26 NaN 62.0 4.0 2.0
2014-03-25 NaN 60.0 NaN NaN
2014-03-24 NaN 59.0 2.0 NaN
2014-03-23 NaN NaN 3.0 2.0
2014-03-22 1.0 29.0 6.0 5.0
Deaths_Nigeria Deaths_Senegal Deaths_UnitedStates Deaths_Spain \
Date
2014-03-26 1.0 NaN 0.0 1.0
2014-03-25 NaN NaN NaN NaN
2014-03-24 NaN NaN NaN NaN
2014-03-23 NaN NaN NaN NaN
2014-03-22 0.0 0.0 0.0 1.0
Deaths_Mali
Date
2014-03-26 NaN
2014-03-25 NaN
2014-03-24 NaN
2014-03-23 NaN
2014-03-22 1.0
'''
每一行的日期索引已经失效,可以将其删除,然后指定正确的索引,即Day列。此时Day不再表示整个数据集中疫情爆发后的天数,而是指特定国家疫情爆发后的天数
# Replace the date index with the Day values and remove Day from the columns.
day_index = ebola_shift['Day']
ebola_shift = ebola_shift.drop(columns=['Day'])
ebola_shift.index = day_index
print(ebola_shift.tail())
'''
Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_Nigeria \
Day
4.0 86.0 8.0 2.0 1.0
3.0 86.0 NaN NaN NaN
2.0 86.0 7.0 NaN NaN
NaN NaN 3.0 2.0 NaN
0.0 49.0 8.0 6.0 0.0
Cases_Senegal Cases_UnitedStates Cases_Spain Cases_Mali \
Day
4.0 NaN 1.0 1.0 NaN
3.0 NaN NaN NaN NaN
2.0 NaN NaN NaN NaN
NaN NaN NaN NaN NaN
0.0 1.0 1.0 1.0 1.0
Deaths_Guinea Deaths_Liberia Deaths_SierraLeone Deaths_Nigeria \
Day
4.0 62.0 4.0 2.0 1.0
3.0 60.0 NaN NaN NaN
2.0 59.0 2.0 NaN NaN
NaN NaN 3.0 2.0 NaN
0.0 29.0 6.0 5.0 0.0
Deaths_Senegal Deaths_UnitedStates Deaths_Spain Deaths_Mali
Day
4.0 NaN 0.0 1.0 NaN
3.0 NaN NaN NaN NaN
2.0 NaN NaN NaN NaN
NaN NaN NaN NaN NaN
0.0 0.0 0.0 1.0 1.0
'''
重采样
- 下采样:从高频率到低频率(比如从每天到每月)
- 上采样:从低频率到高频率(比如从每月到每天)
- 原样采样:采样频率不变(比如每月的第一个星期四到每月的最后一个星期五)
resample函数有一个rule参数,用于接收偏移量字符串。
# Downsample from daily to monthly.
# Each month holds several values, so they have to be aggregated (here: mean).
# NOTE(review): newer pandas releases spell the month-end alias 'ME' instead
# of 'M' — confirm against the pandas version in use.
down = ebola.resample('M').mean()
print(down.iloc[:5, :5])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone \
Date
2014-03-31 4.500000 94.500000 6.500000 3.333333
2014-04-30 24.333333 177.818182 24.555556 2.200000
2014-05-31 51.888889 248.777778 12.555556 7.333333
2014-06-30 84.636364 373.428571 35.500000 125.571429
2014-07-31 115.700000 423.000000 212.300000 420.500000
Cases_Nigeria
Date
2014-03-31 NaN
2014-04-30 NaN
2014-05-31 NaN
2014-06-30 NaN
2014-07-31 1.333333
'''
# Upsample the downsampled (monthly) result back to daily.
# Note how many dates get filled in.
# The added rows are filled with missing values (NaN).
up = down.resample('D').mean()
print(up.iloc[:5, :5])
'''
Day Cases_Guinea Cases_Liberia Cases_SierraLeone Cases_Nigeria
Date
2014-03-31 4.5 94.5 6.5 3.333333 NaN
2014-04-01 NaN NaN NaN NaN NaN
2014-04-02 NaN NaN NaN NaN NaN
2014-04-03 NaN NaN NaN NaN NaN
2014-04-04 NaN NaN NaN NaN NaN
'''
时区
import pytz
import re
# The simplest way to handle time zones in pandas is to use the zone-name
# strings that pytz provides; here pytz.common_timezones is filtered down to
# the names that start with 'US'.
regex = re.compile(r'^US')
selected_files = filter(regex.search, pytz.common_timezones)
print(list(selected_files))
# ['US/Alaska', 'US/Arizona', 'US/Central', 'US/Eastern', 'US/Hawaii', 'US/Mountain', 'US/Pacific']
# Create a timestamp with an explicit time zone.
depart = pd.Timestamp('2017-08-29 07:00', tz='US/Eastern')
print(depart)
# 2017-08-29 07:00:00-04:00
# Another way to attach a time zone is to call tz_localize on a naive
# (zone-less) timestamp.
arrive = pd.Timestamp('2017-08-29 09:57')
print(arrive)
arrive = arrive.tz_localize('US/Pacific')
print(arrive)
# 2017-08-29 09:57:00-07:00
# Convert the flight's arrival time back to the Eastern time zone.
arrive = arrive.tz_convert('US/Eastern')
print(arrive)
# 2017-08-29 12:57:00-04:00
# Time difference between the two timestamps. Per the original note, earlier
# versions required converting both to the same zone first; that is no longer
# necessary.
# duration = arrive.tz_convert('US/Eastern') - depart
duration = arrive - depart
print(duration)
# 0 days 05:57:00