暑假里到设计院实习的时候,每天都要坐公交车。每次出发、到公交车站、公交车来、下车、走到目的地时,我都会拍一张照片,从照片的名字里就可以知道拍照的时间;有时候忘记拍了,就成了缺失数据。我收集了上下班的行程时间数据,做了个分析。在这个里面用到了 seaborn
这个模块,感觉好简洁好强大。
导入模块
%matplotlib inline
import sys
import numpy as np
import pandas as pd
import datetime as dt
import seaborn as sns
# Global seaborn appearance applied to every figure that follows.
sns.set(
    context='notebook',
    style='whitegrid',  # available themes: darkgrid, whitegrid, dark, white, ticks
    font='SimHei',  # works around the Chinese text output problem
    font_scale=1.5,
    rc={'axes.unicode_minus': False},  # works around the minus sign not being displayed
)
预处理数据
从文本文件导入数据
# Load the trip log: one comma-separated record per line, kept as a list of
# string fields (weekday, moment, bus number, then five photo names).
raw_data = []
with open("D:\\Documents\\PythonS\\#bus\\summer_bus.txt", 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        if not line:
            # A blank (e.g. trailing) line would otherwise be stored as ['']
            # and break the fixed-index field access done on every record.
            continue
        raw_data.append(line.split(','))
照片名称是 "IMG_20160713_071539" 这个样子,需要处理一下。我主要关心的是时间间隔,所以先把时间转换成时间戳,然后再相减。后面的数据可视化里时间的单位是分钟,这样比较自然。
# Allocate one zero-initialised structured row per raw record.
# The four *_time fields hold durations; 0 doubles as the "missing" marker.
num = len(raw_data)
data = np.zeros(
    num,
    dtype=[
        ('week', 'U3'),
        ('moment', 'U1'),
        ('bus', np.uint8),
        ('walk_time1', np.uint16),
        ('wait_time', np.uint16),
        ('bus_time', np.uint16),
        ('walk_time2', np.uint16),
    ],
)
# Convert the photo-name fields (columns 3-7) into timestamps, in place.
# Empty fields (photo not taken) are left as '' so they can be detected later.
for record in raw_data:
    for i in range(3, 8):
        # Compare by value: `is not ''` tests object identity, which only
        # works by interning accident and triggers a SyntaxWarning on
        # recent Python versions.
        if record[i] != '':
            record[i] = dt.datetime.strptime(
                record[i], 'IMG_%Y%m%d_%H%M%S').timestamp()
# Copy the records into the numpy structured array.
# For each moment flag: (field for the first walking leg, field for the last).
leg_fields = {
    'm': ('walk_time1', 'walk_time2'),
    'n': ('walk_time2', 'walk_time1'),
}
for row, rec in enumerate(raw_data):
    data['week'][row] = rec[0]
    data['bus'][row] = rec[2]
    data['moment'][row] = rec[1]
    try:
        legs = leg_fields.get(rec[1])
        if legs is None:
            print('时刻数据不正确')
        else:
            first_leg, last_leg = legs
            data[first_leg][row] = rec[4] - rec[3]
            data[last_leg][row] = rec[7] - rec[6]
        data['wait_time'][row] = rec[5] - rec[4]
        data['bus_time'][row] = rec[6] - rec[5]
    except TypeError:
        # A missing photo is still '' and cannot be subtracted; the missing
        # value simply stays 0 for now.
        # NOTE(review): the first failing subtraction also skips the
        # remaining intervals of this row, even if they are computable.
        continue
# pandas' DataFrame is said to be good at statistics, so use it from here on.
Data = pd.DataFrame(data)
# Convert seconds to minutes. The columns come from uint16 fields, so cast
# them to float explicitly instead of relying on an in-place `/=` to upcast
# the integer columns silently during assignment.
minute_cols = Data.loc[:, 'walk_time1':'walk_time2'].columns
Data[minute_cols] = Data[minute_cols].astype(float) / 60
缺失值处理
# The function used to count missing data
def count(x):
    """Return how many elements of the iterable *x* compare equal to 0.

    Missing durations are stored as 0, so applied to a column this gives
    the number of missing values in it.
    """
    return sum(1 for mem in x if mem == 0)
# Count the zero (missing) entries of every column and report them.
miss_count = Data.apply(count, axis=0)
print("各组数据的缺失值数目为:\n", miss_count)
各组数据的缺失值数目为:
week 0
moment 0
bus 0
walk_time1 2
wait_time 3
bus_time 3
walk_time2 1
dtype: int64
# Replace the missing data (stored as 0) with the mean of its column.
# numeric_only=True restricts the mean to the numeric columns; the string
# columns 'week' and 'moment' cannot be averaged and make a plain
# Data.mean() raise on recent pandas versions.
# NOTE(review): the means are computed over all rows, including the 0
# placeholders themselves, so the imputed values are biased low.
Data.replace(to_replace=0,
             value=Data.mean(numeric_only=True).to_dict(),
             inplace=True)
数据可视化分析
各时段用时箱图对比
# Box plot comparing the four legs of a trip side by side (values in minutes).
ax0 = sns.boxplot(
    data=Data.loc[:, ['walk_time1', 'wait_time', 'bus_time', 'walk_time2']],
)
ax0.yaxis.set_label_text('time/min');
从各个时间段的用时来看,乘公交车用的时间最多,等公交车所用的时间是最少的,但却是波动最大的。
总用时
# Total duration of each trip in minutes: the row-wise sum of the four legs.
total_time = Data.loc[:, 'walk_time1':'walk_time2'].sum(axis=1)
ax1 = sns.distplot(
    total_time,
    rug=True,
    axlabel='Total travel time /min',
    kde_kws={'label': "KDE"},
)
mean_total = total_time.mean()
print("路上平均总用时:{:.2f} mins".format(mean_total))
路上平均总用时:57.58 mins
从直方图上看,在55分钟为中心的地方有一点点正态的感觉,但是数据太少不容易说明问题。
等车时间
# Distribution of the waiting time at the bus stop, plus its mean.
wait_minutes = Data['wait_time']
ax2 = sns.distplot(
    wait_minutes,
    bins=9,
    rug=True,
    axlabel='wait_time /min',
    kde_kws={'label': "KDE"},
)
print('等车平均用时{:.2f}mins'.format(wait_minutes.mean()))
等车平均用时5.48mins
等车的时间看起来有点像在发车间隔之间的均匀分布,不过我有两种公交车,再加上数据太少,只是看看吧。
车上的时间
# Distribution of the time spent on the bus.
ax3 = sns.distplot(
    Data['bus_time'],
    bins=8,
    rug=True,
    axlabel='bus_time /min',
    kde_kws={'label': "KDE"},
)
公交车不可能开得太快,但一定会有开得慢的时候。
走路的时间
不用说了,这应该就是正态分布的,只是偶尔有几次我在路上玩了。
import matplotlib.pyplot as plt
from scipy.stats import norm
sns.set_color_codes()  # Map matplotlib color codes to the default seaborn palette.
# Two side-by-side histograms, one per walking leg, each overlaid with a KDE
# and a fitted normal curve (drawn in red).
fig = plt.figure()
fig.set_figwidth(15)
fig.set_figheight(6)
ax4 = fig.add_subplot(121)
ax5 = fig.add_subplot(122)
sns.distplot(Data.loc[:, 'walk_time1'],
             rug=True, axlabel='walk_time1 /min',
             kde_kws={'label': "KDE"}, ax=ax4,
             fit=norm,
             fit_kws={'label': 'norm_fit', 'color': 'r'})
ax4.legend()
# The x label names the plotted column (it previously read 'bus_time /min',
# a copy-paste leftover), and the result is no longer bound to `ax3`, which
# belongs to the bus-time figure.
sns.distplot(Data.loc[:, 'walk_time2'],
             rug=True, axlabel='walk_time2 /min',
             kde_kws={'label': "KDE"}, ax=ax5,
             fit=norm,
             fit_kws={'label': 'norm_fit', 'color': 'r'})
ax5.legend();
工作日及早晚对比
# Bus time per weekday: grey box plots with the individual trips overlaid,
# coloured by moment ('m' / 'n').
# Both layers must use the same category order; without `order` the box plot
# follows the order of appearance in the data, so its boxes would not line
# up with the strip plot's points and tick labels.
weekday_order = ['Mon', 'Tue', 'Wed', 'Tur', 'Fri']
ax6 = sns.boxplot(data=Data, x='week', y='bus_time',
                  order=weekday_order, color=".8")
ax6 = sns.stripplot(data=Data, x='week', y='bus_time',
                    hue='moment', order=weekday_order,
                    jitter=True, size=6, palette=['g', 'r']);
虽然数据太少,还是有点点规律的,至少周一到周五的平均值差不多是递减的,而且早上比晚上用时长。
时“距”图
# One row per trip, five checkpoint columns; column 0 is the departure
# timestamp taken from the first photo, the remaining columns are filled later.
time_data = np.zeros((len(Data), 5), dtype='O')
for row, rec in enumerate(raw_data):
    time_data[row, 0] = rec[3]
# Boolean row masks selecting the 'm' and the 'n' trips.
morning = data['moment'] == 'm'
night = data['moment'] == 'n'
# Write the values imputed in the DataFrame (minutes) back into the
# structured array (seconds) wherever the array still holds the 0 marker.
for col, name in enumerate(data.dtype.names[3:7], start=3):
    miss = data[name] == 0
    # Pay attention here: index the field first and the mask second, so the
    # assignment writes into the structured array itself.
    data[name][miss] = Data.iloc[miss, col] * 60
# Build the checkpoint timestamps cumulatively from the departure time:
# column 1 = at the stop, 2 = boarding, 3 = getting off, 4 = arrival.
# For 'm' trips the first walking leg is walk_time1 and the last is
# walk_time2; for 'n' trips it is the other way round.
time_data[morning, 1] = time_data[morning, 0] + data['walk_time1'][morning]
time_data[night, 1] = time_data[night, 0] + data['walk_time2'][night]
time_data[:, 2] = time_data[:, 1] + data['wait_time']
time_data[:, 3] = time_data[:, 2] + data['bus_time']
time_data[morning, 4] = time_data[morning, 3] + data['walk_time2'][morning]
# The last leg of an 'n' trip is walk_time1 (walk_time2 was already used as
# its first leg above).
time_data[night, 4] = time_data[night, 3] + data['walk_time1'][night]
def _to_clock_time(ts):
    """Return the time-of-day part of the timestamp *ts*."""
    return dt.datetime.fromtimestamp(ts).time()


# Element-wise conversion of every checkpoint timestamp to a datetime.time.
time_ufunc = np.frompyfunc(_to_clock_time, 1, 1)
time_data = time_ufunc(time_data)
# 'm' trips: one line per trip across the five checkpoints.
# Each direction gets its own figure; on shared axes the second xticks call
# would overwrite the tick labels of the first plot.
plt.figure()
ax7 = plt.plot(time_data[data['moment'] == 'm'].T, color='b');
plt.xticks(range(5), ('家', '到车站', '上车', '下车', '单位'));
# 'n' trips: same layout, with the checkpoint labels in the opposite direction.
plt.figure()
plt.plot(time_data[data['moment'] == 'n'].T, color='b');
plt.xticks(range(5), ('单位', '到车站', '上车', '下车', '家'));
数据
Tue,m,49,IMG_20160713_071539,IMG_20160713_072725,IMG_20160713_073128,IMG_20160713_075851,IMG_20160713_081227
Tue,n,49,IMG_20160713_180352,IMG_20160713_181355,IMG_20160713_181735,IMG_20160713_184614,IMG_20160713_185826
Wed,m,71,IMG_20160714_072533,IMG_20160714_073650,IMG_20160714_074434,IMG_20160714_081433,IMG_20160714_082838
Wed,n,71,IMG_20160714_180144,IMG_20160714_181208,IMG_20160714_181216,IMG_20160714_184007,IMG_20160714_185207
Tur,m,71,IMG_20160715_072217,IMG_20160715_073350,IMG_20160715_073555,IMG_20160715_080228,IMG_20160715_081710
Tur,n,71,IMG_20160715_175304,IMG_20160715_180541,IMG_20160715_181607,IMG_20160715_184306,IMG_20160715_185510
Tur,m,49,IMG_20160721_072002,IMG_20160721_073235,,IMG_20160721_075720,IMG_20160721_081031
Tur,n,49,IMG_20160721_180613,IMG_20160721_181655,IMG_20160721_182710,IMG_20160721_185214,IMG_20160721_190331
Fri,m,49,IMG_20160722_073151,IMG_20160722_074356,IMG_20160722_075233,IMG_20160722_081713,IMG_20160722_083230
Fri,n,71,IMG_20160722_175006,IMG_20160722_180133,IMG_20160722_180302,,
Mon,m,49,IMG_20160725_072804,,IMG_20160725_074801,IMG_20160725_081555,
Mon,n,71,IMG_20160725_175529,IMG_20160725_180715,IMG_20160725_181002,IMG_20160725_183535,IMG_20160725_184736
Tue,m,71,IMG_20160726_072258,IMG_20160726_073350,IMG_20160726_073638,IMG_20160726_080054,IMG_20160726_081619
Tue,n,71,IMG_20160726_175722,IMG_20160726_181015,IMG_20160726_182006,IMG_20160726_184624,IMG_20160726_185831
Wed,m,49,IMG_20160727_071805,IMG_20160727_072739,IMG_20160727_072908,IMG_20160727_080022,IMG_20160727_081455
Wed,n,49,IMG_20160727_180114,IMG_20160727_181413,IMG_20160727_182320,IMG_20160727_184700,IMG_20160727_185746
Tur,m,49,IMG_20160728_071452,IMG_20160728_072511,IMG_20160728_073047,IMG_20160728_075837,IMG_20160728_081403
Tur,n,49,IMG_20160728_180028,IMG_20160728_180906,IMG_20160728_182056,IMG_20160728_184732,IMG_20160728_190028
Fri,m,49,IMG_20160729_073628,IMG_20160729_074616,IMG_20160729_074711,IMG_20160729_081552,IMG_20160729_082956
Fri,n,71,IMG_20160729_174821,IMG_20160729_180153,IMG_20160729_180710,IMG_20160729_183250,IMG_20160729_184502
Mon,m,71,IMG_20160801_072546,IMG_20160801_073721,IMG_20160801_073820,IMG_20160801_080751,IMG_20160801_082403
Mon,n,49,IMG_20160801_180036,IMG_20160801_181159,IMG_20160801_181611,IMG_20160801_184219,IMG_20160801_185446
Tue,m,49,IMG_20160802_072720,IMG_20160802_073719,IMG_20160802_073839,IMG_20160802_080553,IMG_20160802_082240
Tue,n,49,IMG_20160802_180337,IMG_20160802_181618,IMG_20160802_182343,IMG_20160802_184718,IMG_20160802_185858
Wed,m,71,IMG_20160803_072833,IMG_20160803_073920,IMG_20160803_074022,IMG_20160803_080610,IMG_20160803_082055
Wed,n,49,IMG_20160803_171638,IMG_20160803_173337,IMG_20160803_174315,IMG_20160803_180915,IMG_20160803_183030
Tur,m,49,IMG_20160804_071143,IMG_20160804_071934,IMG_20160804_072123,IMG_20160804_074614,IMG_20160804_081631
Tur,n,49,IMG_20160804_174958,IMG_20160804_180250,IMG_20160804_180904,IMG_20160804_183432,IMG_20160804_184934
Fri,m,49,IMG_20160805_073658,IMG_20160805_074736,IMG_20160805_074856,IMG_20160805_081728,IMG_20160805_083455
Fri,n,49,IMG_20160805_180415,IMG_20160805_181435,IMG_20160805_182532,IMG_20160805_185121,IMG_20160805_190333
Mon,m,49,IMG_20160808_073009,IMG_20160808_074126,IMG_20160808_074235,IMG_20160808_081407,IMG_20160808_083110
Mon,n,71,IMG_20160808_180458,IMG_20160808_181306,IMG_20160808_182640,IMG_20160808_184939,IMG_20160808_185949
Tue,n,71,IMG_20160809_180205,IMG_20160809_181401,IMG_20160809_182328,IMG_20160809_185103,IMG_20160809_190530