R语言学习笔记总结

R语言初步-用dplyr进行数据转换

# Setup: install and load the packages used in these notes.
install.packages("tidyverse")
install.packages("nycflights13")  # both packages must be installed before they can be loaded
library(nycflights13)  # provides the flights data set
library(tidyverse)

?flights  # open the help page describing the data set
flights  # print the flight data

5、其他常用的摘要函数

之前使用了均值、求和和计数

5.1、位置度量：median()函数

median()用法和mean()类似，只不过是中位数而已

# Drop every row whose departure delay or arrival delay is missing; the
# remaining rows are treated as the non-cancelled flights.
Not_cancelled <- flights %>%
  filter(!(is.na(dep_delay) | is.na(arr_delay)))

# Per-day arrival-delay summary:
#   avg_delay1 - mean arrival delay over all flights of the day
#   avg_delay2 - mean arrival delay over the flights with a positive delay
Not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    avg_delay1 = mean(arr_delay),
    avg_delay2 = mean(arr_delay[arr_delay > 0])
  )

#运行：
A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day avg_delay1 avg_delay2
   <int> <int> <int>      <dbl>      <dbl>
 1  2013     1     1     12.7         32.5
 2  2013     1     2     12.7         32.0
 3  2013     1     3      5.73        27.7
 4  2013     1     4     -1.93        28.3
 5  2013     1     5     -1.53        22.6
 6  2013     1     6      4.24        24.4
 7  2013     1     7     -4.95        27.8
 8  2013     1     8     -3.23        20.8
 9  2013     1     9     -0.264       25.6
10  2013     1    10     -5.90        27.3
# ... with 355 more rows

5.2、分散程度度量：sd()、IQR()、mad()函数

sd():标准差函数：standard deviation，分散程度的标准度量方式
IQR():四分位距
mad()：绝对中位差

注：IQR()与mad()基本等价，二者都比sd()更稳健，更适合有离群点的情况。


# Standard deviation of the flight distance for each destination,
# sorted from the largest spread to the smallest.
Not_cancelled %>%
  group_by(dest) %>%
  summarise(distance_sd = sd(distance)) %>%
  arrange(desc(distance_sd))

#运行：
A tibble: 104 x 2
   dest  distance_sd
   <chr>       <dbl>
 1 EGE         10.5 
 2 SAN         10.4 
 3 SFO         10.2 
 4 HNL         10.0 
 5 SEA          9.98
 6 LAS          9.91
 7 PDX          9.87
 8 PHX          9.86
 9 LAX          9.66
10 IND          9.46
# ... with 94 more rows

5.3、秩的度量：min()、quantile()、max()函数

quantile():分位数函数，是中位数函数的拓展
使用说明：quantile(x,0.25)是指将x按从小到大顺序排列，找到大于前25%，小于后75%的值。

#每天最早和最晚的航班是什么时候：

# Smallest and largest departure time recorded on each day.
Not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first = min(dep_time),
    last = max(dep_time)
  )

#运行：
A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day first  last
   <int> <int> <int> <int> <int>
 1  2013     1     1   517  2356
 2  2013     1     2    42  2354
 3  2013     1     3    32  2349
 4  2013     1     4    25  2358
 5  2013     1     5    14  2357
 6  2013     1     6    16  2355
 7  2013     1     7    49  2359
 8  2013     1     8   454  2351
 9  2013     1     9     2  2252
10  2013     1    10     3  2320
# ... with 355 more rows

5.4、定位度量：first()、nth()、last()函数

first(x)、nth(x,2)、last(x)这三个函数的作用相当于x[1]、x[2]、x[length(x)]
在数据已按出发时间排序的前提下，通过这些函数也可以找出最早和最晚出发的航班


# Departure time of the first and of the last row within each day.
# first()/last() are positional, so the result follows the row order
# of Not_cancelled.
Not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(
    first_dep = first(dep_time),
    last_dep = last(dep_time)
  )

#运行：
# A tibble: 365 x 5
# Groups:   year, month [12]
    year month   day first_dep last_dep
   <int> <int> <int>     <int>    <int>
 1  2013     1     1       517     2356
 2  2013     1     2        42     2354
 3  2013     1     3        32     2349
 4  2013     1     4        25     2358
 5  2013     1     5        14     2357
 6  2013     1     6        16     2355
 7  2013     1     7        49     2359
 8  2013     1     8       454     2351
 9  2013     1     9         2     2252
10  2013     1    10         3     2320
# ... with 355 more rows

5.5、计数n(),count()

n():不需要任何参数，返回当前分组的大小
sum(!is.na(x)):计算非缺失值的数量
n_distinct(x):计算唯一值的数量
count()函数：用于只需要计数的情况

例如：
计算哪个目的地有最多的航空公司？

# Number of distinct carriers per destination, with the destinations
# that have the most carriers listed first.
Not_cancelled %>%
  group_by(dest) %>%
  summarise(carriers = n_distinct(carrier)) %>%
  arrange(desc(carriers))

#运行：
A tibble: 104 x 2
   dest  carriers
   <chr>    <int>
 1 ATL          7
 2 BOS          7
 3 CLT          7
 4 ORD          7
 5 TPA          7
 6 AUS          6
 7 DCA          6
 8 DTW          6
 9 IAD          6
10 MSP          6
# ... with 94 more rows

count()函数用法举例：计算飞往每个目的地的航班数量

# Number of rows (flights) for each destination.
count(Not_cancelled, dest)

#运行：
#A tibble: 104 x 2
   dest      n
   <chr> <int>
 1 ABQ     254
 2 ACK     264
 3 ALB     418
 4 ANC       8
 5 ATL   16837
 6 AUS    2411
 7 AVL     261
 8 BDL     412
 9 BGR     358
10 BHM     269
# ... with 94 more rows

count()函数中可以添加加权变量，例如distance，用于计算飞机飞行里程（相当于求和）

# Weighted count: for each tail number, n is the sum of distance
# instead of the number of rows.
count(Not_cancelled, tailnum, wt = distance)

#运行：
# A tibble: 4,037 x 2
   tailnum      n
   <chr>    <dbl>
 1 D942DN    3418
 2 N0EGMQ  239143
 3 N10156  109664
 4 N102UW   25722
 5 N103US   24619
 6 N104UW   24616
 7 N10575  139903
 8 N105UW   23618
 9 N107US   21677
10 N108UW   32070
# ... with 4,027 more rows

5.6、逻辑值的计数和比例

当逻辑值用于数值计算时，TRUE会被转换为1，FALSE会被转换为0。
sum():可以找出TRUE的数量
mean():可以找出比例

以下一例：找出出发时间小于5:00的航班总数

# Number of flights per day with dep_time below 500.
# The comparison yields a logical vector; sum() counts its TRUE values.
Not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(n_nearly = sum(dep_time < 500))

#运行：
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day n_nearly
   <int> <int> <int>    <int>
 1  2013     1     1        0
 2  2013     1     2        3
 3  2013     1     3        4
 4  2013     1     4        3
 5  2013     1     5        3
 6  2013     1     6        2
 7  2013     1     7        2
 8  2013     1     8        1
 9  2013     1     9        3
10  2013     1    10        3
# ... with 355 more rows

在summarise()中把sum(dep_time<500)换成count(dep_time<500)是行不通的：sum()是把逻辑向量中的TRUE当作1累加起来，而count()是作用于数据框的函数，不能直接对dep_time<500这样的逻辑向量计数。

以下一例：找出延误超过一小时的航班比例

# Share of flights per day whose arrival delay exceeds 60 minutes.
# mean() of a logical vector is the proportion of TRUE values.
Not_cancelled %>%
  group_by(year, month, day) %>%
  summarise(hour_perc = mean(arr_delay > 60))

#运行：
# A tibble: 365 x 4
# Groups:   year, month [12]
    year month   day hour_perc
   <int> <int> <int>     <dbl>
 1  2013     1     1    0.0722
 2  2013     1     2    0.0851
 3  2013     1     3    0.0567
 4  2013     1     4    0.0396
 5  2013     1     5    0.0349
 6  2013     1     6    0.0470
 7  2013     1     7    0.0333
 8  2013     1     8    0.0213
 9  2013     1     9    0.0202
10  2013     1    10    0.0183
# ... with 355 more rows

                                    学习R语言真的好快乐哈哈

生活也不过如此.jpg

R语言初步-数据转换-6.summarise()函数的综合运用