1、计算每日航班数量
计算每日航班数量
> daily <- flights %>%
+ mutate(date=make_date(year,month,day)) %>%
+ group_by(date) %>%
+ summarize(n=n())
`summarise()` ungrouping output (override with `.groups` argument)
> daily
# 可视化
> ggplot(daily,aes(date,n))+
+ geom_line()
- n()的用法:分组计数
> daily <- flights %>%
+ mutate(date=make_date(year,month,day)) %>%
+ group_by(date)
> daily
> daily
# A tibble: 336,776 x 20
# Groups: date [365]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 542 540 2 923 850
4 2013 1 1 544 545 -1 1004 1022
5 2013 1 1 554 600 -6 812 837
6 2013 1 1 554 558 -4 740 728
7 2013 1 1 555 600 -5 913 854
8 2013 1 1 557 600 -3 709 723
9 2013 1 1 557 600 -3 838 846
10 2013 1 1 558 600 -2 753 745
> count(group_by(daily,date))
# A tibble: 365 x 2
# Groups: date [365]
date n
<date> <int>
1 2013-01-01 842
2 2013-01-02 943
3 2013-01-03 914
4 2013-01-04 915
5 2013-01-05 720
6 2013-01-06 832
7 2013-01-07 933
8 2013-01-08 899
9 2013-01-09 902
10 2013-01-10 932
数据显然有以周为单位的变化,这影响了数据的长期观察。
检查航班数量在每一天正宗的分布
> Sys.setlocale("LC_ALL","English")
[1] "LC_COLLATE=English_United States.1252;LC_CTYPE=English_United States.1252;LC_MONETARY=English_United States.1252;LC_NUMERIC=C;LC_TIME=English_United States.1252"
> daily <- daily %>%
+ mutate(wday=wday(date,label = T,locale = Sys.getlocale(category = "LC_TIME")))
> daily
# A tibble: 365 x 3
date n wday
<date> <int> <ord>
1 2013-01-01 842 Tue
2 2013-01-02 943 Wed
3 2013-01-03 914 Thu
4 2013-01-04 915 Fri
5 2013-01-05 720 Sat
6 2013-01-06 832 Sun
7 2013-01-07 933 Mon
8 2013-01-08 899 Tue
9 2013-01-09 902 Wed
10 2013-01-10 932 Thu
# ... with 355 more rows
> ggplot(daily,aes(wday,n))+
+ geom_boxplot()
必须把locale设为英文环境,否则lucbridate会根据当前环境读取中文,显示中文的星期几
消除强烈的模式——建立模型
> ## 拟合模型
> mod <- lm(n~wday,data = daily)
> grid <- daily %>%
+ data_grid(wday) %>%
+ add_predictions(mod,"n")
> ggplot(daily,aes(wday,n))+
+ geom_boxplot()+
+ geom_point(data = grid,aes(wday,n),color="red",size=4)
> ggplot(daily,aes(wday,n))+
+ geom_boxplot()+
+ geom_point(data = grid,color="red",size=4)
> grid <- daily %>%
+ data_grid(wday) %>%
+ add_predictions(mod,"n")
> ggplot(daily,aes(wday,n))+
+ geom_boxplot()+
+ geom_point(data = grid,color="red",size=4)
分析残差
> daily <- daily %>%
+ add_residuals(mod)
> daily %>%
+ ggplot(aes(date,resid))+
+ geom_ref_line(h=0)+
+ geom_line()
六月开始模型并不适用
→进一步分析——按照wday分别展示
按照wday分别展示
daily %>%
ggplot(aes(date,resid,color=wday))+
geom_ref_line(h=0)+
geom_line()
# 显示航班特别少的日期
daily %>%
filter(resid < -100)
显示更平滑的长期趋势: smooth()
daily %>%
ggplot(aes(date,resid))+
geom_ref_line(h=0)+
geom_line(color="grey50")+
geom_smooth(se=F,span=0.20)
季节性星期六效应
> daily %>%
+ filter(wday=="Sat") %>%
+ ggplot(aes(date,n))+
+ geom_point()+
+ geom_line()+
+ scale_x_date(
+ NULL,
+ date_breaks = "1 month",
+ date_labels = "%b"
+ )
labels = b% : 见strptime {base}
%b
Abbreviated month name in the current locale on this platform. (Also matches full name on input: in some locales there are no abbreviations of names.)
从图中可看出周六航班的季节变化。可能与学期假期相关。
按学期分类处理数据
> term <- function(date){
+ cut(date,
+ breaks = ymd(20130101,20130605,20130825,20140101),
+ labels = c("spring","summer","fall"))
+ }
> daily <- daily %>%
+ mutate(term=term(date))
>
> daily %>%
+ filter(wday == "Sat") %>%
+ ggplot(aes(date,n,color=term))+
+ geom_point(alpha=1/3)+
+ geom_line()+
+ scale_x_date(
+ NULL,
+ date_breaks = "1 month",
+ date_labels = "%b")
查看学期变量如何影响一周中其他wday
daily %>%
ggplot(aes(wday,n,color=term))+
geom_boxplot()
拟合去除每学期周内效应的模型
mod1 <- lm(n~wday,data = daily)
mod2 <- lm(n~wday * term,data = daily)
daily %>%
gather_residuals(without_term=mod1, with_term=mod2) %>%
ggplot(aes(date,resid,color=model))+
geom_line()
将预测值覆盖到数据上
grid <- daily %>%
data_grid(wday,term) %>%
add_predictions(mod2,"n")
ggplot(daily,aes(wday,n))+
geom_boxplot()+
geom_point(data = grid,color="red")+
facet_wrap(~term)
发现问题:离群点
解决:使用MASS::rlm()
处理离群点—— MASS::rlm()
library(MASS)
mod3 <- rlm(n~wday * term,data = daily)
daily %>%
add_residuals(mod3,"resid") %>%
ggplot(aes(date,resid))+
geom_hline(yintercept = 0,size=2,color="white")+
geom_line()