这是stringr包中常用的字符串处理函数,字符串处理好了,简直是利器在手。
str_length获得字符串长度
> b<-c('abc','456','aini')
> str_length(b)
[1] 3 3 4
> length(b)
[1] 3
str_sub截断字符串
使用str_sub()提取子字符串。它有三个参数:字符向量、起始位置和结束位置。每个位置既可以是正整数(从左边开始计数),也可以是负整数(从右边开始计数)。
x <- c("abcdef", "ghifjk")
str_sub(x, 3, 3)
#> [1] "c" "i"
str_sub(x, 2, -2)
#> [1] "bcde" "hifj"
str_sub可以修改字符
str_sub(x, 3, 3) <- "X"
x
#> [1] "abXdef" "ghXfjk"
str_pad通过填充空格将字符串补齐到指定长度
x <- c("abc", "defghi")
str_pad(x, 10)
#> [1] "       abc" "    defghi"
str_pad(x, 10, "both")
#> [1] "   abc    " "  defghi  "
#str_pad不会将字符长度缩短
str_pad(x, 4)
#> [1] " abc" "defghi"
x <- c("Short", "This is a long string")
x %>%
str_trunc(10) %>%
str_pad(10, "right")
#> [1] "Short     " "This is..."
str_trim与str_pad相反,删除空格
x <- c(" a ", "b ", " c")
str_trim(x)
#> [1] "a" "b" "c"
str_trim(x, "left")
#> [1] "a " "b " "c"
str_wrap包装一段文字,使每一行的长度尽可能相似。
jabberwocky <- str_c(
"`Twas brillig, and the slithy toves ",
"did gyre and gimble in the wabe: ",
"All mimsy were the borogoves, ",
"and the mome raths outgrabe. "
)
cat(str_wrap(jabberwocky, width = 40))
#> `Twas brillig, and the slithy toves did
#> gyre and gimble in the wabe: All mimsy
#> were the borogoves, and the mome raths
#> outgrabe.
str_to_upper、str_to_lower转换大小写,str_to_title将每个单词的首字母大写
x <- "I like horses."
str_to_upper(x)
#> [1] "I LIKE HORSES."
str_to_title(x)
#> [1] "I Like Horses."
str_to_lower(x)
#> [1] "i like horses."
# Turkish has two sorts of i: with and without the dot
str_to_lower(x, "tr")
#> [1] "ı like horses."
str_order()和str_sort()对字符向量排序
str_order和str_sort的区别在于前者返回排序后的索引(下标),后者返回排序后的实际值
x <- c("y", "i", "k")
str_order(x)
#> [1] 2 3 1
str_sort(x)
#> [1] "i" "k" "y"
# In Lithuanian, y comes between i and k
str_sort(x, locale = "lt")
#> [1] "i" "y" "k"
str_detect()和str_subset()检测字符串中是否存在某种匹配模式
str_detect()检测模式是否存在,并返回逻辑向量(类似于grepl())。str_subset()返回字符向量中与正则表达式匹配的元素(类似于带value = TRUE的grep())。
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
[1] TRUE TRUE TRUE TRUE
str_detect(fruit, "^a")
[1] TRUE FALSE FALSE FALSE
str_detect("aecfg", letters)
[1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
[12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[23] FALSE FALSE FALSE FALSE
> str_subset(fruit, "a")
[1] "apple" "banana" "pear" "pinapple"
> str_subset(fruit, "a$")
[1] "banana"
str_count计算匹配数
fruit <- c("apple", "banana", "pear", "pineapple")
str_count(fruit, "a")
## [1] 1 3 1 1
str_count(fruit, "p")
## [1] 2 0 1 3
str_count(fruit, c("a", "b", "p", "p"))
## [1] 1 1 1 3
str_locate和str_locate_all()定位
str_locate()定位模式的第一个位置,并返回一个包含列start和end的数字矩阵。str_locate_all()查找所有匹配项,返回一个数字矩阵列表。类似于regexpr()和gregexpr()。
> x <- c("abcdef", "ghifjk")
> str_locate(x, "def")
start end
[1,] 4 6
[2,] NA NA
> str_locate(x, "fjk")
start end
[1,] NA NA
[2,] 4 6
str_locate_all(c("abcdefabc", "ghifjkabc"), "abc")
[[1]]
start end
[1,] 1 3
[2,] 7 9
[[2]]
start end
[1,] 7 9
str_extract和str_extract_all提取匹配字符串
str_extract()提取与第一个匹配项对应的文本,返回一个字符向量。str_extract_all()提取所有匹配项并返回字符向量列表。
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_extract(shopping_list, "\\d")
## [1] "4" NA NA "2"
str_extract(shopping_list, "[a-z]+")
## [1] "apples" "bag" "bag" "milk"
str_extract(shopping_list, "[a-z]{1,4}")
## [1] "appl" "bag" "bag" "milk"
str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
## [1] NA "bag" "bag" "milk"
> str_extract_all(shopping_list, "[a-z]+")
[[1]]
[1] "apples" "x"
[[2]]
[1] "bag" "of" "flour"
[[3]]
[1] "bag" "of" "sugar"
[[4]]
[1] "milk" "x"
str_replace和str_replace_all字符串替换
str_replace()替换第一个匹配的模式并返回一个字符向量。str_replace_all()替换所有匹配项。类似于sub()和gsub()。
> string<-str_replace('1989.03.24','\\.','-')
> string
[1] "1989-03.24"
> string<-str_replace_all('1989.03.24','\\.','-')
> string
[1] "1989-03-24"
str_split和str_split_fixed字符串分割
str_split_fixed()根据模式将字符串分割成固定数量的片段,并返回一个字符矩阵。str_split()将字符串分割成可变数量的片段,并返回一个字符向量列表。
str_split("a-b-c", "-")
#> [[1]]
#> [1] "a" "b" "c"
str_split_fixed("a-b-c", "-", n = 2)
#> [,1] [,2]
#> [1,] "a" "b-c"