2020/03/31 Chapter 3 使用dplyr进行数据转换
选取的数据为2013年从纽约市出发的航班信息
(需提前安装相应的"nycflights13"包)
# Install nycflights13 only when it is missing, so that re-running the script
# does not download and reinstall the package every time, then attach it.
if (!requireNamespace("nycflights13", quietly = TRUE)) {
  install.packages("nycflights13")
}
library(nycflights13)
同时使用tidyverse包的核心R包 ——dplyr
# Install tidyverse only when it is missing, so that re-running the script
# does not download and reinstall the package every time, then attach it.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
library(tidyverse)
查看数据flights
# Print the flights tibble that ships with the nycflights13 package.
nycflights13::flights
> nycflights13::flights
# A tibble: 336,776 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
# ... with 336,766 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>,
# minute <dbl>, time_hour <dttm>
5个核心函数
1.filter()
2.arrange()
3.select()
4.mutate()
5.summarize()
过滤数据
# Keep only the flights that departed on January 1st. The result is not
# assigned, so it is printed.
nycflights13::flights %>%
  filter(month == 1, day == 1)
> filter(nycflights13::flights,month == 1,day == 1)
# A tibble: 842 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR
2 2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA
3 2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK
4 2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK
5 2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA
6 2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR
7 2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR
8 2013 1 1 557 600 -3 709 723 -14 EV 5708 N829AS LGA
9 2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK
10 2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA
# ... with 832 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
# Flights on January 1st, stored in `ja01` and then printed.
ja01 <- nycflights13::flights %>%
  filter(month == 1, day == 1)
ja01

# Flights on December 24th, stored in `dec24` and then printed.
dec24 <- nycflights13::flights %>%
  filter(month == 12, day == 24)
dec24

# Wrapping the assignment in parentheses assigns and prints in one step.
(dec24 <- nycflights13::flights %>%
  filter(month == 12, day == 24))
比较运算符 ">" ">=" "<" "<=" "!="不等于 "=="等于
near( )
# near() compares numbers with a tolerance, so this returns TRUE even though
# floating-point rounding makes the exact test sqrt(2)^2 == 2 return FALSE.
near(sqrt(2)^2,2)
> near(sqrt(2)^2,2)
[1] TRUE
逻辑运算符
# Flights that departed in November or December.
nycflights13::flights %>%
  filter(month == 11 | month == 12)
> filter(nycflights13::flights,month==11|month==12)
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 2013 11 1 5 2359 6 352 345 7 B6 745 N568JB JFK
2 2013 11 1 35 2250 105 123 2356 87 B6 1816 N353JB JFK
3 2013 11 1 455 500 -5 641 651 -10 US 1895 N192UW EWR
4 2013 11 1 539 545 -6 856 827 29 UA 1714 N38727 LGA
5 2013 11 1 542 545 -3 831 855 -24 AA 2243 N5CLAA JFK
6 2013 11 1 549 600 -11 912 923 -11 UA 303 N595UA JFK
7 2013 11 1 550 600 -10 705 659 6 US 2167 N748UW LGA
8 2013 11 1 554 600 -6 659 701 -2 US 2134 N742PS LGA
9 2013 11 1 554 600 -6 826 827 -1 DL 563 N912DE LGA
10 2013 11 1 554 600 -6 749 751 -2 DL 731 N315NB LGA
# ... with 55,393 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
# The same selection written with %in%: keep rows whose month is 11 or 12,
# store them in `nov_dec`, then print.
nov_dec <- nycflights13::flights %>%
  filter(month %in% c(11, 12))
nov_dec
> nov_dec <- filter(nycflights13::flights,month %in% c(11,12))
> nov_dec
# A tibble: 55,403 x 19
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin
<int> <int> <int> <int> <int> <dbl> <int> <int> <dbl> <chr> <int> <chr> <chr>
1 2013 11 1 5 2359 6 352 345 7 B6 745 N568JB JFK
2 2013 11 1 35 2250 105 123 2356 87 B6 1816 N353JB JFK
3 2013 11 1 455 500 -5 641 651 -10 US 1895 N192UW EWR
4 2013 11 1 539 545 -6 856 827 29 UA 1714 N38727 LGA
5 2013 11 1 542 545 -3 831 855 -24 AA 2243 N5CLAA JFK
6 2013 11 1 549 600 -11 912 923 -11 UA 303 N595UA JFK
7 2013 11 1 550 600 -10 705 659 6 US 2167 N748UW LGA
8 2013 11 1 554 600 -6 659 701 -2 US 2134 N742PS LGA
9 2013 11 1 554 600 -6 826 827 -1 DL 563 N912DE LGA
10 2013 11 1 554 600 -6 749 751 -2 DL 731 N315NB LGA
# ... with 55,393 more rows, and 6 more variables: dest <chr>, air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>,
# time_hour <dttm>
# Flights that were NOT delayed by more than two hours on arrival or on
# departure. Rows where the condition evaluates to NA are dropped by filter().
filter(nycflights13::flights,!(arr_delay>120|dep_delay>120)) # negated "either delay > 120"
# Equivalent form obtained with De Morgan's law: both delays are <= 120.
filter(nycflights13::flights,arr_delay<=120,dep_delay<=120)
缺失值 NA(not available,不可用)
is.na(x) #确定x是否为缺失值
filter( )只会筛选出条件为TRUE的行,它会排除那些条件为FALSE和NA的行
3.2.4练习题
# Flights that arrived two or more hours late.
nycflights13::flights %>%
  filter(arr_delay >= 120)
# Flights that flew to IAH or HOU.
nycflights13::flights %>%
  filter(dest %in% c("IAH", "HOU"))
# Flights operated by United, American or Delta. The `carrier` column stores
# two-letter airline codes ("UA", "AA", "DL"), not airline names, so comparing
# it against "United" / "American" / "Delta" would match no rows.
filter(nycflights13::flights, carrier %in% c("UA", "AA", "DL"))
# Flights that departed in summer (July, August or September).
nycflights13::flights %>%
  filter(month %in% 7:9)
# Flights that arrived more than two hours late but did not leave late.
# `dep_delay <= 0` also keeps early departures (negative delays), which the
# stricter `dep_delay == 0` would drop.
filter(nycflights13::flights, arr_delay > 120, dep_delay <= 0)
# Flights that departed at least an hour late but made up more than 30 minutes
# in the air. Time made up in flight is the departure delay minus the arrival
# delay, so the test is on that difference rather than on each delay alone.
filter(nycflights13::flights, dep_delay >= 60, dep_delay - arr_delay > 30)
# Flights that departed between midnight and 6am (inclusive). `dep_time` is an
# HHMM integer and midnight is recorded as 2400 rather than 0, so it needs its
# own condition; the old `dep_time >= 0` test never excluded anything.
filter(nycflights13::flights, dep_time <= 600 | dep_time == 2400)
# Open the help page for dplyr::between(). The topic is written without
# parentheses: `?f()` with a call is the method-documentation form of `?`,
# not the plain topic lookup intended here.
?dplyr::between
微信截图_20200331200438.png
# Flights whose departure time is missing; presumably these are cancelled
# flights (verify against the dataset documentation).
filter(nycflights13::flights,is.na(dep_time))
# Cases where an expression involving NA still has a known result.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
NA^0
#> NA^0
#[1] 1
# Any number raised to the power 0 is 1, so the unknown value does not matter.
NA | TRUE
#> NA | TRUE
#[1] TRUE
FALSE & NA
#> FALSE & NA
#[1] FALSE