1.1 导入需要使用的包:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
1.2 利用pd.read_csv导入数据, 命名为cab
# Read the raw trip records from disk into the DataFrame `cab`.
cab = pd.read_csv(filepath_or_buffer='/Users/tangyu/Desktop/test.csv')
1.3 查看cab的数据组成
# Print the column names, non-null counts, dtypes and memory usage.
# info is a method: without the parentheses it is never called.
cab.info()
1.4 结果如下,注意1) 数据类型为DataFrame 2)列名和类型 3)内存使用情况
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 625134 entries, 0 to 625133
Data columns (total 9 columns):
id 625134 non-null object
vendor_id 625134 non-null int64
pickup_datetime 625134 non-null object
passenger_count 625134 non-null int64
pickup_longitude 625134 non-null float64
pickup_latitude 625134 non-null float64
dropoff_longitude 625134 non-null float64
dropoff_latitude 625134 non-null float64
store_and_fwd_flag 625134 non-null object
dtypes: float64(4), int64(2), object(3)
memory usage: 42.9+ MB
1.5 查看数据前/后5项
cab.head(n=5)  # first 5 rows
cab.tail(n=5)  # last 5 rows
len(cab['id'])  # total number of records
>>> 625134
1.6 自定义函数jan判断日期是否属于一月份,并利用filter函数代替for循环筛选出一月份的日期
# Shorthand handles on the pickup-time and passenger-count columns.
dates = cab.pickup_datetime
pas = cab.passenger_count
def jan(date):
    """Return True when *date* falls before February 2016.

    *date* is a timestamp string such as '2016-01-31 08:15:00'; it is
    compared lexicographically. The bound is the first day of February
    (exclusive) so that timestamps during January 31 are kept: a string
    with a time part sorts after the bare date '2016-01-31'.
    """
    return date < '2016-02-01'
# Pickup timestamps accepted by jan(). Wrapping in list() keeps the result
# a real list on Python 3, where filter() returns a lazy one-shot iterator.
jan_list = list(filter(jan, dates))
1.7 np.unique函数计算每辆车运营的乘客人数
sum()计算总人数
利用for循环计算每一种人数客车的比例
# Distinct passenger counts (sorted ascending) together with the number of
# trips recorded for each of them, computed in one vectorized pass.
pas_per_car, _trip_counts = np.unique(cab['passenger_count'],
                                      return_counts=True)
# Total number of passengers carried across every trip in the dataset.
total_pas = int(cab['passenger_count'].sum())
# Number of trips for each passenger count, aligned with pas_per_car.
pas_per_car_counts = _trip_counts.tolist()
# Share of all trips that carried each passenger count. The denominator is
# the number of trips (not total_pas), so the shares add up to 1.
pas_per_car_ratio = (_trip_counts / float(len(cab))).tolist()
1.8 计算每个月的数据,注意 & 和 | 分别表示“与”(and)和“或”(or)
# Slice the trips month by month using half-open ranges
# [first day of month, first day of next month) on the timestamp strings.
# NOTE: the names cab_May and cab_Mar are swapped relative to their content
# (cab_May holds March, cab_Mar holds May). They are kept as-is because
# later code lists them in this positional, chronological order.
pickup = cab['pickup_datetime']
cab_Jan = cab[pickup < '2016-02-01']
cab_Feb = cab[(pickup >= '2016-02-01') & (pickup < '2016-03-01')]
cab_May = cab[(pickup >= '2016-03-01') & (pickup < '2016-04-01')]  # March
cab_Apr = cab[(pickup >= '2016-04-01') & (pickup < '2016-05-01')]
cab_Mar = cab[(pickup >= '2016-05-01') & (pickup < '2016-06-01')]  # May
cab_Jun = cab[(pickup >= '2016-06-01') & (pickup < '2016-07-01')]
1.9 每个月乘客人数分别求和;每个月乘客人数的最大值(一月、六月为9,其余月份为6);各人数的分布每个月基本一致
pas_list = []  # total passengers carried in each month
pas_max_list = []  # largest passenger count on a single trip, per month
# Trips per distinct passenger count, flattened month after month: each
# month contributes len(pas_per_car) consecutive entries.
pas_per_car_monthly_counts = []
for cab_mons in [cab_Jan, cab_Feb, cab_May, cab_Apr, cab_Mar, cab_Jun]:
    pas_count = int(cab_mons['passenger_count'].sum())  # monthly total
    pas_list.append(pas_count)
    pas_max = int(cab_mons['passenger_count'].max())
    # Rows of this month that carried the monthly maximum.
    cab_max_monthly = cab_mons[cab_mons['passenger_count'] == pas_max]
    pas_max_list.append(pas_max)
    for i in pas_per_car:
        pas_per_car_count_month = int((cab_mons['passenger_count'] == i).sum())
        pas_per_car_monthly_counts.append(pas_per_car_count_month)
# print() with a single argument behaves the same on Python 2 and 3.
print(pas_list)
print(pas_max_list)
print(pas_per_car_monthly_counts)
[162880, 169805, 182752, 178058, 179140, 166191] [9, 6, 6, 6, 6, 9] [2, 69319, 13984, 3848, 1848, 5396, 3278, 1, 2, 72881, 14592, 4012, 1897, 5464, 3466, 0, 8, 77849, 15545, 4576, 2177, 5875, 3667, 0, 3, 76139, 15647, 4480, 2055, 5683, 3425, 0, 6, 75934, 15793, 4484, 2068, 5814, 3471, 0, 2, 71325, 14466, 4286, 1972, 5179, 3214, 1]
pas_per_car # passenger count per cab (the distinct values)
>>> array([0, 1, 2, 3, 4, 5, 6, 9])
1.10 最后一列(store_and_fwd_flag)不为N的共3430行
# Count the rows whose store_and_fwd_flag differs from 'N'.
(cab['store_and_fwd_flag'] != 'N').sum()
>>> 3430
作图部分
2.1 配色卡
import seaborn as sns

# Preview the 8-colour cubehelix palette.
sns.palplot(sns.cubehelix_palette(8, start=.5, rot=-.75))
本文使用的配色卡
2.2 不同人数的乘客比例分布
# Pie chart: how the trips split across the distinct passenger counts.
labels = pas_per_car
explode = (0, 0.1, 0.1, 0, 0, 0, 0, 0)  # pull out the 2nd and 3rd slices
plt.figure(figsize=(7, 7))
plt.pie(
    pas_per_car_counts,
    labels=labels,
    explode=explode,
    startangle=140,
    shadow=True,
    autopct='%1.1f%%',
)
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
plt.legend(bbox_to_anchor=(-0.1, 1), loc='upper left')
2.3 不同月份中不同人数的占比分布:基本一致
# One pie per month (3 x 2 grid) showing how that month's trips split
# across the distinct passenger counts.
plt.figure(figsize=(20, 20))
labels = pas_per_car
explode = (0, 0.1, 0.1, 0, 0, 0, 0, 0)  # pull out the 2nd and 3rd slices
# pas_per_car_monthly_counts is flat and filled in chronological order
# (January to June), so the captions must be chronological as well.
xlabel = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun']
step = len(labels)  # number of consecutive entries belonging to one month
for i, month_name in enumerate(xlabel, start=1):
    y = (i - 1) * step  # offset of this month's run of counts
    plt.subplot(3, 2, i)
    plt.pie(pas_per_car_monthly_counts[y:y + step], explode=explode,
            labels=labels,
            autopct='%1.1f%%', shadow=True, startangle=140)
    plt.axis('equal')
    plt.xlabel(month_name)
    plt.legend(loc='upper left', bbox_to_anchor=(-0.1, 1))
2.4 发现有乘客为0的情况,可能的解释 1)这种情况可能是司机师傅没有打表计费; 2)乘车人取消了订单;3)数据错误
# Number of trips recorded with zero passengers.
(cab['passenger_count'] == 0).sum()
>>> 23
2.5 2016上半年总运载的乘客人数为1038826人次
total_pas  # show the overall passenger total
>>> 1038826
2.6 一月中乘客为9人的数据
# January trips that carried the largest passenger count seen in January.
# Select from cab_Jan explicitly: the loop variable cab_mons left over from
# the monthly loop refers to the last month iterated, not to January.
cab_Jan[cab_Jan['passenger_count'] == cab_Jan['passenger_count'].max()]
2.7 一月到六月的乘客人数对比
# Bar chart: total passengers carried in each month, January through June.
x = range(1, 7)  # month numbers 1..6
# Pass x and y by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x=list(x), y=pas_list)
plt.ylim((150000, 200000))  # fix the y range so the differences are visible
plt.xlabel('Month')
plt.ylabel('Passenger count')
一月到六月的乘客人数条形图对比
仔细看第一天的情况
3.1 获取第一天的数据
# Trips picked up before 2016-01-02, i.e. the first day of the data.
cab_1st_day = cab[cab['pickup_datetime'] < '2016-01-02']  # the first day
# Two-digit hour of each pickup; the timestamps are formatted as
# 'YYYY-MM-DD HH:MM:SS', so the hour sits at characters 11-12.
pickup_hour = cab_1st_day['pickup_datetime'].str[11:13]
# cab_hour[h] is the number of passengers picked up during clock hour h.
cab_hour = []
for i in range(24):
    sum_by_hour = int(
        cab_1st_day.loc[pickup_hour == '%02d' % i, 'passenger_count'].sum())
    cab_hour.append(sum_by_hour)
>>> (array([ 2057., 0., 605., 0., 155., 0., 103., 0., 168., 93.]), array([ 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5, 5. , 5.5, 6. ]), )
3.2 一天的人流变化
# 一天的人流变化
import datetime
# Passengers picked up in each clock hour of the first day.
# Grouping by the parsed hour replaces the row-by-row scan, which counted
# rows sharing a timestamp more than once, never stored the final bucket
# and rebuilt the timestamp list on every iteration.
pickup_hour = pd.to_datetime(cab_1st_day['pickup_datetime'],
                             format="%Y-%m-%d %H:%M:%S").dt.hour
# pas_hours[h] is the total for hour h (0..23); hours without trips get 0.
pas_hours = (cab_1st_day['passenger_count']
             .groupby(pickup_hour)
             .sum()
             .reindex(range(24), fill_value=0)
             .tolist())
# plot
x = range(len(pas_hours))  # hour of day, 0..23
sns.barplot(x=list(x), y=pas_hours)
plt.xlabel('hour')
plt.ylabel('Passenger count in the first day')
3.3 六月的情况
import datetime
# Passengers picked up on each calendar day of June.
# The original scan stepped date_point back by one day at a time (its
# "one hour" comment was stale), so the buckets are days; the name
# pas_hours is kept because the plotting code below reads it.
# Grouping by day avoids double-counting rows that share a timestamp,
# the spurious leading 0 bucket and the missing final bucket.
pickup_day = pd.to_datetime(cab_Jun['pickup_datetime'],
                            format="%Y-%m-%d %H:%M:%S").dt.day
# pas_hours[d - 1] is the total for June d (1..30); empty days get 0.
pas_hours = (cab_Jun['passenger_count']
             .groupby(pickup_day)
             .sum()
             .reindex(range(1, 31), fill_value=0)
             .tolist())
cab_Jun['id'].shape[0]  # number of records in June
>>> 100445
cab_Jun['passenger_count'].sum()  # number of passengers in June
画了最后三天的柱状图:
# Bar chart of the June passenger totals held in pas_hours.
plt.figure(figsize=(10, 10))
x = range(1, len(pas_hours) + 1)  # 1-based bucket index
# Pass x and y by keyword: recent seaborn releases no longer accept them
# positionally.
sns.barplot(x=list(x), y=pas_hours)
# NOTE(review): the June buckets are built with a one-day step, so 'hour'
# may be a leftover label from the first-day plot - confirm the unit.
plt.xlabel('hour')
plt.ylabel('Passenger count in Jun')
三天的流量分布
根据经纬度画出地图
4.1 上车点经纬度的最大/最小值
# Extremes of the pickup coordinates, printed one value per line in the
# order: max longitude, min longitude, max latitude, min latitude.
pickup_lon = cab['pickup_longitude']
pickup_lat = cab['pickup_latitude']
lon_max, lon_min = max(pickup_lon), min(pickup_lon)
lat_max, lat_min = max(pickup_lat), min(pickup_lat)
for bound in (lon_max, lon_min, lat_max, lat_min):
    print(bound)
>>> -69.248916626 -121.933128357 42.8149375916 37.3895874023
4.2 统计相关
import math
# Summary statistics for the pickup and drop-off longitude columns.
# A list literal is required here: subscripting the builtin `list` does
# not build a list. The columns are read from `cab`, the only DataFrame
# loaded above.
longitudes = ['pickup_longitude', 'dropoff_longitude']
for longitude in longitudes:
    print(cab[longitude].describe())
4.3 首先利用matplotlib 和 basemap 画出地图
from mpl_toolkits.basemap import Basemap, cm
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
fig = plt.figure(figsize=(8, 8))
ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
# Stereographic Basemap whose corners span the contiguous United States.
# To zoom in on New York City instead, use lon_0=-73.93, lat_0=40.65,
# llcrnrlat=40.2, urcrnrlat=40.9, llcrnrlon=-74.10, urcrnrlon=-73.75.
m = Basemap(projection='stere', lat_0=90, lon_0=-105,
            llcrnrlat=23.41, urcrnrlat=45.44,
            llcrnrlon=-118.67, urcrnrlon=-64.52,
            rsphere=6371200., resolution='l', area_thresh=10000)
# Draw state and country boundaries.
m.drawstates()
m.drawcountries()
# Draw parallels every 10 degrees, labelled on the left edge.
parallels = np.arange(0., 90, 10.)
m.drawparallels(parallels, labels=[1, 0, 0, 0], fontsize=10)
# Draw meridians every 10 degrees, labelled on the bottom edge.
meridians = np.arange(-110., -60., 10.)
m.drawmeridians(meridians, labels=[0, 0, 0, 1], fontsize=10)
m.fillcontinents(color='coral')
# Pickup locations of the first day, in degrees.
lons = np.array(cab_1st_day['pickup_longitude'])
lats = np.array(cab_1st_day['pickup_latitude'])
# Project longitude/latitude into the map's own x/y coordinates. Only the
# projected points are plotted: raw degrees do not share the map's
# coordinate system. `cmap` is dropped because a fixed `color` is given.
x, y = m(lons, lats)
plt.scatter(x, y, 50, color='b')
仔细看一天的地图
5.1 获取第一天的上下车经纬度和时间
# Pickup longitude, latitude and timestamp of the first day as arrays.
lons, lats, time = (
    np.array(cab_1st_day[column])
    for column in ('pickup_longitude', 'pickup_latitude', 'pickup_datetime')
)
5.2 第一天的上下车情况,以一个小时做为时间间隔,画出每个小时的散点图
# 一天的人流变化
import folium
import datetime
import time
import os
from selenium import webdriver
#date = []
#time = []
#pas_hour = 0
#pas_hours = []
def _save_hour_snapshot(hour_map, index):
    """Save hour_map as '<index>.html' and screenshot it to '<index>.jpg'."""
    lj = str(index)
    fn = lj + '.html'
    tmpurl = 'file://{path}/{mapfile}'.format(path=os.getcwd(), mapfile=fn)
    hour_map.save(fn)
    browser = webdriver.Safari()
    try:
        browser.get(tmpurl)
        # Give the map tiles some time to load
        time.sleep(4)
        browser.save_screenshot(lj + '.jpg')
    finally:
        # Always release the browser, even when loading or saving fails.
        browser.quit()


# Drop-off locations of the first day, drawn as red circles on one map;
# a snapshot is saved each time the scan crosses an hour boundary. The map
# is never reset, so every snapshot also shows the circles added before it.
j = 1  # index of the next snapshot file
m = folium.Map(location=[40.8, -73.9], zoom_start=11)
d_lons = np.array(cab_1st_day['dropoff_longitude'])
d_lats = np.array(cab_1st_day['dropoff_latitude'])
# Build the timestamp list once instead of on every iteration.
pickup_times = cab_1st_day['pickup_datetime'].tolist()
# The scan walks backwards in time, one hour per bucket.
# NOTE(review): assumes cab_1st_day is ordered from the latest to the
# earliest pickup - confirm against the input file.
date_point = datetime.datetime.strptime("2016-01-01 22:59:59", "%Y-%m-%d %H:%M:%S")
for d_lat, d_lon, date_time in zip(d_lats, d_lons, pickup_times):
    date_obj = datetime.datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
    if date_obj <= date_point:
        # Crossed into the previous hour: save the map before adding more.
        display(m)
        _save_hour_snapshot(m, j)
        date_point -= datetime.timedelta(hours=1)  # one-hour interval
        j += 1
    m.add_child(folium.Circle(location=[d_lat, d_lon], color='#FF0000'))
# Save the last bucket as well; the loop only saves on a boundary crossing,
# so the circles added after the final crossing would otherwise be lost.
if pickup_times:
    display(m)
    _save_hour_snapshot(m, j)
好看多了
5.3 制作gif
from PIL import Image, ImageSequence
import sys, os
filenames=sorted(fn for fn in os.listdir('.') if fn.endswith('.jpg')) # 获取文件并排序
filenames.sort(key = lambda f: int(filter(str.isdigit, f))) # sort by int
# 再次按照int排序:保证排序
im=Image.open('1.jpg')
print(filenames)
im.save('traffic_1st_day_drop.gif', save_all=True, append_images= [Image.open(filename) for filename in filenames],loop=5,duration=500)