小洁详解《R数据科学》第16章 Purrr下

4 for循环与函数式编程

library(tidyverse)
# Example data: a tibble with four columns (a-d), each holding 10 values
# drawn with rnorm(10).
df <- tibble(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))

用for循环计算每列的均值

# Column means with an explicit loop: preallocate one double slot per
# column of df, then fill slot i with the mean of column i.
output <- double(length(df))
for (i in seq_along(df)) {
  output[[i]] <- mean(df[[i]])
}
output
#> [1] -0.3893274  0.2581997 -0.3238829  0.2181311

书中给出一个奇怪的例子来说明函数组合

# Three helpers that differ only in the exponent: each returns the absolute
# deviation of every element of `x` from mean(x), raised to the power
# 1, 2 or 3 respectively.
f1 <- function(x) {
  centered <- x - mean(x)
  abs(centered)^1
}
f2 <- function(x) {
  centered <- x - mean(x)
  abs(centered)^2
}
f3 <- function(x) {
  centered <- x - mean(x)
  abs(centered)^3
}

# About abs():
# It is the absolute-value function; its help page also contains the line:
# abs(x) returns an integer vector when x is integer or logical.

abs(TRUE)
#> [1] 1
abs(FALSE)
#> [1] 0

# Absolute deviation of each element of `x` from mean(x), raised to the
# power `i`.
f <- function(x, i) {
  centered <- x - mean(x)
  abs(centered)^i
}
# This function is really a vectorised one: x should be a vector.
# Passing the whole df as x gives nothing but NA.
f(df[[1]],3)
#>  [1] 3.418992e-01 1.620302e-01 5.253052e-01 4.321956e-02 6.489365e+00
#>  [6] 2.156025e+00 4.578027e-02 3.268124e-01 6.154311e-02 2.012281e-06

将函数作为参数传入另一个函数

# Apply a summary function to every column of a data frame.
#
# Args:
#   df:  a data frame / tibble (any list of vectors works).
#   fun: a function that takes one column and returns a single number.
#
# Returns:
#   An unnamed double vector with one element per column, in column order
#   (numeric(0) when `df` has no columns).
#
# vapply() is used instead of filling a preallocated vector so that a `fun`
# returning a non-numeric value, or more/less than one value, fails loudly
# instead of silently coercing or truncating the result.
col_summary <- function(df, fun) {
  vapply(df, fun, FUN.VALUE = numeric(1), USE.NAMES = FALSE)
}
# The object being operated on is now df; the summary function is passed in
# as the second argument.
col_summary(df,mean)
#> [1] -0.3893274  0.2581997 -0.3238829  0.2181311

col_summary(df, median)
#> [1] -0.2201724  0.2174858 -0.5452395  0.1845153

col_summary(df, sd)
#> [1] 0.9108518 1.1001024 1.1995958 0.6618306

5.映射函数-map

这是一个强大的函数族,直接返回结果

• map() 用于输出列表；
• map_lgl() 用于输出逻辑型向量；
• map_int() 用于输出整型向量；
• map_dbl() 用于输出双精度型向量；
• map_chr() 用于输出字符型向量。

# Column-wise mean, median and standard deviation of df, each returned as a
# named double vector.
map_dbl(.x = df, .f = mean)
#>          a          b          c          d 
#> -0.3893274  0.2581997 -0.3238829  0.2181311
map_dbl(.x = df, .f = median)
#>          a          b          c          d 
#> -0.2201724  0.2174858 -0.5452395  0.1845153
map_dbl(.x = df, .f = sd)
#>         a         b         c         d 
#> 0.9108518 1.1001024 1.1995958 0.6618306
# Quietly try plain map(), without the _dbl suffix, to see what happens
map(.x = df, .f = mean)
#> $a
#> [1] -0.3893274
#> 
#> $b
#> [1] 0.2581997
#> 
#> $c
#> [1] -0.3238829
#> 
#> $d
#> [1] 0.2181311

#返回了一个列表。还是指定数据类型比较好

可以支持管道操作

# The same three summaries written with the pipe: df on the left-hand side
# is supplied to map_dbl() as the data to iterate over.
df %>% map_dbl(mean)
#>          a          b          c          d 
#> -0.3893274  0.2581997 -0.3238829  0.2181311
df %>% map_dbl(median)
#>          a          b          c          d 
#> -0.2201724  0.2174858 -0.5452395  0.1845153
df %>% map_dbl(sd)
#>         a         b         c         d 
#> 0.9108518 1.1001024 1.1995958 0.6618306

创建线性模型（暂时不用深究，知道实现了什么即可）

# Cylinder count of every car in mtcars; the models below are fitted
# separately for each distinct value.
mtcars$cyl
#>  [1] 6 6 4 6 8 6 8 4 4 6 6 8 8 8 8 8 8 4 4 4 4 8 8 8 8 4 4 4 8 6 8 4

# Split mtcars by cyl and fit mpg ~ wt on each piece. The result is a list
# of lm fits named by cyl value. Note the lambda's `df` parameter shadows
# any outer `df` inside the function body only.
models <- map(
  split(mtcars, mtcars$cyl),
  function(df) lm(mpg ~ wt, data = df)
)

提取模型的 r.squared--拟合度

# Summarise every model, then pull r.squared out of each summary as a
# named double vector.
models %>%
  map(summary) %>%
  map_dbl(~ .x$r.squared) # basic approach: formula shorthand
#>         4         6         8 
#> 0.5086326 0.4645102 0.4229655

models %>%
  map(summary) %>%
  map_dbl("r.squared") # improved approach: a string extracts by name
#>         4         6         8 
#> 0.5086326 0.4645102 0.4229655

# An integer extracts by position: here the 2nd element of each sub-list.
x <- list(list(1, 2, 3), list(4, 5, 6), list(7, 8, 9))
map_dbl(x, 2)
#> [1] 2 5 8

5.2R基础包

提取每个向量中＞0.8的数

# Two lists of three numeric vectors each. In x1 the vectors contain 1, 2
# and 0 values above 0.8; in x2 every vector contains exactly one.
x1 <- list(
  c(0.27, 0.37, 0.57, 0.91, 0.20),
  c(0.90, 0.94, 0.66, 0.63, 0.06),
  c(0.21, 0.18, 0.69, 0.38, 0.77)
)
x2 <- list(
  c(0.50, 0.72, 0.99, 0.38, 0.78),
  c(0.93, 0.21, 0.65, 0.13, 0.27),
  c(0.39, 0.01, 0.38, 0.87, 0.34)
)

# Keep only the elements of `x` that are strictly greater than `cutoff`
# (default 0.8).
threshold <- function(x, cutoff = 0.8) {
  above_cutoff <- x > cutoff
  x[above_cutoff]
}
# sapply() chooses its return shape from the results. For x1 the three
# calls give 1, 2 and 0 values, so the output stays a list.
x1 %>% sapply(threshold) %>% str()
#> List of 3
#>  $ : num 0.91
#>  $ : num [1:2] 0.9 0.94
#>  $ : num(0)
# For x2 every call gives exactly one value, so the output is a vector.
x2 %>% sapply(threshold) %>% str()
#>  num [1:3] 0.99 0.93 0.87

盯了一会儿想明白了，为什么同样类型的两个list输出结果一个是list，一个是向量呢？

这两个list的区别：x1中三个向量符合要求的数值个数各不相同（分别是1个、2个、0个），而x2中每个向量符合要求的个数都是1。sapply会根据结果自动选择输出形式，这就是上面说的“你不知道会得到什么样的输出”。

关于替代方式vapply，并没有给出具体例子来理解，这里的重点是purrr而非基础包。

# Wrap log() with safely(): the wrapped function returns a list with a
# `result` element and an `error` element.
safe_log <- safely(.f = log)
safe_log(10) %>% str()
#> List of 2
#>  $ result: num 2.3
#>  $ error : NULL
# str(safe_log("a"))

safely函数用于查看是否有错。
返回result和error两个结果。二者不可兼得哈哈。

safely() 也可以与 map() 函数共同使用

# Apply the safely()-wrapped log to every element of x; y holds one
# result/error pair per input element.
x <- list(1, 10, "a")
y <- map(x, safely(log))
str(y)
#> List of 3
#>  $ :List of 2
#>   ..$ result: num 0
#>   ..$ error : NULL
#>  $ :List of 2
#>   ..$ result: num 2.3
#>   ..$ error : NULL
#>  $ :List of 2
#>   ..$ result: NULL
#>   ..$ error :List of 2
#>   .. ..$ message: chr "non-numeric argument to mathematical function"
#>   .. ..$ call   : language log(x = x, base = base)
#>   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"
# At this point the output is grouped per input element. It is more
# convenient to group it by outcome instead, which is what transpose() does.
# It returns 2 lists: the results collected together and the errors
# collected together.
y <- y %>% transpose() 
str(y)
#> List of 2
#>  $ result:List of 3
#>   ..$ : num 0
#>   ..$ : num 2.3
#>   ..$ : NULL
#>  $ error :List of 3
#>   ..$ : NULL
#>   ..$ : NULL
#>   ..$ :List of 2
#>   .. ..$ message: chr "non-numeric argument to mathematical function"
#>   .. ..$ call   : language log(x = x, base = base)
#>   .. ..- attr(*, "class")= chr [1:3] "simpleError" "error" "condition"

# An element succeeded when its error slot is NULL.
is_ok <- map_lgl(y$error, is_null)
# Original inputs whose call failed.
x[!is_ok]
#> [[1]]
#> [1] "a"
# Results of the calls that succeeded, flattened to a double vector.
flatten_dbl(y$result[is_ok])
#> [1] 0.000000 2.302585

另外两个函数：possibly和quietly
possibly：成功时返回结果，失败时返回默认值。
quietly：返回一个列表，包括result、output、warnings、messages四个元素。

# possibly() returns the value given as `otherwise` whenever the wrapped
# function fails.
x <- list(1, 10, "a")
map_dbl(x, possibly(log, otherwise = NA_real_))
#> [1] 0.000000 2.302585       NA
map_dbl(x, possibly(log, otherwise = FALSE))
#> [1] 0.000000 2.302585 0.000000
#后面这行是我乱写的，逻辑值被转换成了0，这个参数是设置错误返回的默认值。

# quietly() captures result, printed output, warnings and messages for
# each call.
x <- list(1, 10)
x %>%
  map(quietly(log)) %>%
  str()
#> List of 2
#>  $ :List of 4
#>   ..$ result  : num 0
#>   ..$ output  : chr ""
#>   ..$ warnings: chr(0) 
#>   ..$ messages: chr(0) 
#>  $ :List of 4
#>   ..$ result  : num 2.3
#>   ..$ output  : chr ""
#>   ..$ warnings: chr(0) 
#>   ..$ messages: chr(0)

#这个不能存在字符串元素，否则报错

7. 多参数映射

p236
一个参数多次改变，生成多组数据

# Each element of mu is passed to rnorm() as its first unnamed argument.
# Because n is fixed by name, that argument is matched to the mean,
# giving 5 draws per mean.
mu <- list(5, 10, -3)
str(map(mu, rnorm, n = 5))
#> List of 3
#>  $ : num [1:5] 6.06 5.06 4.28 4.67 3.58
#>  $ : num [1:5] 9.68 11.99 9.72 11.04 10.51
#>  $ : num [1:5] -3.03 -3.12 -1.79 -2.31 -2.84

关于rnorm，有三个参数，个数,均值，标准差。他们的默认顺序是个数-均值-标准差，想要改变顺序，需要指定参数名。

2个参数多次改变，生成多组数据

# A single positional argument is taken as n: 10 draws with the default
# mean and sd.
rnorm(10)
#>  [1]  0.65730448  0.09405843 -0.97479908 -1.28342701  0.67344511
#>  [6] -0.13338635 -1.62213469 -0.27219989 -0.36645900  2.14604270
# Two positional arguments: n = 5, mean = 10.
rnorm(5,10)
#> [1] 10.531237 11.055287 10.223192 10.169048  9.877334
# n is matched by name, so the unnamed 5 falls to the next parameter, mean:
# 10 draws centred on 5.
rnorm(5,n=10)
#>  [1] 5.890862 4.368264 5.896254 5.711182 4.555885 5.992307 6.263651
#>  [8] 4.281883 3.434235 4.508614
# n = 100 by name; the unnamed 5 and 20 fill mean and sd in that order.
# The sample sd and mean below are consistent with sd = 20 and mean = 5.
x <- rnorm(5,20,n=100)
sd(x)
#> [1] 20.61705
mean(x)
#> [1] 4.588476

同一个函数中两个参数不同，用map2
三个以上参数不同，用pmap，需要用多个等长列表保存不同参数，最好使用命名参数。可用tribble保存。
函数不同，参数也不同，用invoke_map()（注意：从purrr 1.0.0起invoke_map()已被弃用，官方建议改用map()/map2()配合exec()）

8.游走函数

重要的是保存结果，如多张ggplot绘图保存（小本本已记下）

library(ggplot2)
# Build one scatter plot per cyl group, then derive a "<cyl>.pdf" file
# name for each plot from the list names.
plots <- mtcars %>%
  split(.$cyl) %>%
  map(~ ggplot(.x, aes(mpg, wt)) + geom_point())
paths <- stringr::str_c(names(plots), ".pdf")
# pwalk() calls ggsave() once per (path, plot) pair for its side effect,
# writing every file into the session's temporary directory.
pwalk(list(paths, plots), ggsave, path = tempdir())
#> Saving 7 x 7 in image
#> Saving 7 x 7 in image
#> Saving 7 x 7 in image

9.for循环的其他模式

（1）预测函数

keep和discard
保留true或false对应的元素
some和every
分别判断：是否存在某个元素使预测函数为真（some），是否所有元素都使预测函数为真（every）。
我的理解，some是：是否存在为真的元素。

# some() asks whether at least one element satisfies the predicate.
x <- list(1:5, letters, list(10))
some(x, is_integer)
#> [1] TRUE
some(x, is_character)
#> [1] TRUE

detect和detect_index
第一个true的值和位置
head_while tail_while
head_while从头开始、tail_while从尾开始，依次取出使预测函数为真的元素，直到遇到第一个为假的元素为止。

(2)归约与累计

reduce和accumulate

reduce() 函数使用一个“二元”函数(即具有两个基本输入的函数)，将其不断应用于一个列表，直到最后只剩下一个元素为止。
累计函数与归约函数很相似，但前者会保留所有中间结果。

微信公众号生信星球同步更新我的文章，欢迎大家扫码关注！

我们有为生信初学者准备的学习小组,点击查看◀️
想要参加我的线上线下课程，也可加好友咨询🔼
如果需要提问，请先看生信星球答疑公告