《R for Data Science》第十一章 Data import 啃书知识点积累
参考链接:R for Data Science
数据导入
- 常用函数
read_csv() reads comma delimited files,
read_csv2() reads semicolon separated files,
read_tsv() reads tab delimited files, and
read_delim() reads in files with any delimiter.
read_fwf() reads fixed width files. You can specify fields either by their widths with fwf_widths() or their position with fwf_positions().
read_table() reads a common variation of fixed width files where columns are separated by white space.
read_log() reads Apache style log files.
Apache日志参考:Apache日志详解 (一般用不到)
- 读取和特殊创建
# It prints out a column specification that gives the name and type of each column.
heights <- read_csv("data/heights.csv")
#> Parsed with column specification:
#> cols(
#> earn = col_double(),
#> height = col_double(),
#> sex = col_character(),
#> ed = col_double(),
#> age = col_double(),
#> race = col_character()
#> )
# 利用读取函数创建tibble
# 注意换行后不需要加,号
read_csv("a,b,c
1,2,3
4,5,6")
#> # A tibble: 2 x 3
#> a b c
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
# 等价于
read_csv("a,b,c\n1,2,3\n4,5,6")
# 按行创建tibble的经典方法
ts <- tribble(
~a, ~b, ~c,
#--/--/----
1, 2, 3,
4, 5, 6
)
- read_csv()常用参数
# skip 跳过指定数量的行
read_csv("The first line of metadata
The second line of metadata
x,y,z
1,2,3", skip = 2)
#> # A tibble: 1 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
# comment 识别以comment开头的行为注释行,跳过
read_csv("# A comment I want to skip
x,y,z
1,2,3", comment = "#")
#> # A tibble: 1 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
# col_names为FALSE时不识别第一行为列名,默认为TRUE
read_csv("1,2,3\n4,5,6", col_names = FALSE)
#> # A tibble: 2 x 3
#> X1 X2 X3
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
# col_names指定具体向量可以重新命名
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
#> # A tibble: 2 x 3
#> x y z
#> <dbl> <dbl> <dbl>
#> 1 1 2 3
#> 2 4 5 6
# na 将文件中满足条件的数据识别为NA
read_csv("a,b,c\n1,2,.", na = ".")
#> # A tibble: 1 x 3
#> a b c
#> <dbl> <dbl> <lgl>
#> 1 1 2 NA
# quote 指定符号用于限定字符串
# Single character used to quote strings.
read_csv("x,y\n1,'a,b'",quote="'")
# # A tibble: 1 x 2
# x y
# <dbl> <chr>
# 1 1 a,b
# 不指定quote时默认的引号是双引号("),单引号不会被当作引号,'a,b'会按delim(逗号)拆开,超过列数的部分被截断
read_csv("x,y\n1,'a,b'")
# # A tibble: 1 x 2
# x y
# <dbl> <chr>
# 1 1 'a
# locale 中指定 encoding 编码类型
read_csv('数据1.csv',
locale = locale(encoding = "GBK"))
- read_csv和原生read.csv的区别
- They are typically much faster (~10x) than their base equivalents.
- They produce tibbles.
- They don’t convert character vectors to factors.
- They are more reproducible. Base R functions inherit some behaviour from your operating system and environment variables, so import code that works on your computer might not work on someone else’s.
- Q: Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?
# read_csv() and read_tsv() are special cases of the general read_delim(). They're useful for reading the most common types of flat file data, comma separated values and tab separated values, respectively.
# 以下参数二者公有
read_csv(file, col_names = TRUE, col_types = NULL,
locale = default_locale(), na = c("", "NA"), quoted_na = TRUE,
quote = "\"", comment = "", trim_ws = TRUE, skip = 0,
n_max = Inf, guess_max = min(1000, n_max),
progress = show_progress(), skip_empty_rows = TRUE)
read_tsv(file, col_names = TRUE, col_types = NULL,
locale = default_locale(), na = c("", "NA"), quoted_na = TRUE,
quote = "\"", comment = "", trim_ws = TRUE, skip = 0,
n_max = Inf, guess_max = min(1000, n_max),
progress = show_progress(), skip_empty_rows = TRUE)
解析数据
parse_*(): These functions take a character vector and return a more specialised vector like a logical, integer, or date.
str(parse_logical(c("TRUE", "FALSE", "NA")))
#> logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
#> int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
#> Date[1:2], format: "2010-01-01" "1979-10-14"
- 解析异常及处理
parse_integer(c("1", "231", ".", "456"), na = ".")
#> [1] 1 231 NA 456
parse_integer(c("123", "345", "abc", "123.45"))
#> [1] 123 345 NA NA
#> attr(,"problems")
#> # A tibble: 2 x 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA an integer abc
#> 2 4 NA no trailing characters .45
用 problems(x) 捕获异常,形成 tibble:
x <- parse_integer(c("123", "345", "abc", "123.45"))
problems(x)
#> # A tibble: 2 x 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA an integer abc
#> 2 4 NA no trailing characters .45
- 解析数字
- parse_number可以只提取数字而忽略数字旁的符号
parse_number("$100")
#> [1] 100
parse_number("20%")
#> [1] 20
parse_number("It cost $123.45")
#> [1] 123.45
- 分组符号和小数符号
parse_double("1.23")
#> [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
#> [1] 1.23
# Used in America
parse_number("$123,456,789")
#> [1] 1.23e+08
# Used in many parts of Europe
parse_number("123.456.789", locale = locale(grouping_mark = "."))
#> [1] 1.23e+08
# Used in Switzerland
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
#> [1] 1.23e+08
Tips
(1) decimal_mark 和 grouping_mark相同会报错
parse_number("1,23", locale = locale(decimal_mark = ","))
# [1] 1.23
parse_number("1,23", locale = locale(grouping_mark = ","))
# [1] 123
parse_number("1,23", locale = locale(
grouping_mark = ",",
decimal_mark = ","))
# 错误: `decimal_mark` and `grouping_mark` must be different
- decimal_mark 默认为 '.',grouping_mark 默认为 ',';如果其中一个占用了对方的默认符号,另一个的默认值会自动换成对方原来的符号(可视作交换)
# decimal_mark为','时grouping_mark默认为'.'
parse_number("2.221,23", locale = locale(decimal_mark = ","))
# [1] 2221.23
# grouping_mark为'.'时decimal_mark默认为','
parse_number("2.221,23", locale = locale(grouping_mark = "."))
# [1] 2221.23
- integer、double、number 对比
parse_logical(c("TRUE", "FALSE", "1", "0", "true", "t", "NA"))
# [1] TRUE FALSE TRUE FALSE TRUE TRUE NA
parse_integer(c("1235", "0134", "NA"))
# [1] 1235 134 NA
parse_number(c("1235", "0134", "NA"))
# [1] 1235 134 NA
parse_double(c("1235", "0134", "NA"))
# [1] 1235 134 NA
parse_integer(c("1000", "$1,000", "10.00"))
# Warning: 2 parsing failures.
# row col expected actual
# 2 -- an integer $1,000
# 3 -- no trailing characters .00
#
# [1] 1000 NA NA
# attr(,"problems")
# # A tibble: 2 x 4
# row col expected actual
# <int> <int> <chr> <chr>
# 1 2 NA an integer $1,000
# 2 3 NA no trailing characters .00
parse_number(c("1000", "$1,000", "10.00"))
# [1] 1000 1000 10
parse_double(c("1000", "$1,000", "10.00"))
# Warning: 1 parsing failure.
# row col expected actual
# 2 -- a double $1,000
#
# [1] 1000 NA 10
# attr(,"problems")
# # A tibble: 1 x 4
# row col expected actual
# <int> <int> <chr> <chr>
# 1 2 NA a double $1,000
- 解析字符串
charToRaw("Hadley")
#> [1] 48 61 64 6c 65 79
The mapping from hexadecimal number to character is called the encoding
给乱码重新编码
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
#> [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
#> [1] "こんにちは"
- 猜编码:正确率有限,文本量大时正确率高
guess_encoding(charToRaw(x1))
#> # A tibble: 2 x 2
#> encoding confidence
#> <chr> <dbl>
#> 1 ISO-8859-1 0.46
#> 2 ISO-8859-9 0.23
guess_encoding(charToRaw(x2))
#> # A tibble: 1 x 2
#> encoding confidence
#> <chr> <dbl>
#> 1 KOI8-R 0.42
# 猜错了,实际应该是Shift-JIS
- 解析因子
fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
#> Warning: 1 parsing failure.
#> row col expected actual
#> 3 -- value in level set bananana
#> [1] apple banana <NA>
#> attr(,"problems")
#> # A tibble: 1 x 4
#> row col expected actual
#> <int> <int> <chr> <chr>
#> 1 3 NA value in level set bananana
#> Levels: apple banana
- 解析日期时间
- 基础用法
parse_datetime("2010-10-01T2010")
#> [1] "2010-10-01 20:10:00 UTC"
# If time is omitted, it will be set to midnight
parse_datetime("20101010")
#> [1] "2010-10-10 UTC"
parse_date("2010-10-01")
#> [1] "2010-10-01"
library(hms)
parse_time("01:10 am")
#> 01:10:00
parse_time("20:10:01")
#> 20:10:01
- date-time format
- 常用格式
- 可以用通配符跳过
parse_time("现在的时间是:20点10分!以及01秒",
'%*%H%*%M%*%S%*')
# 20:10:01
- 指定不同的格式会出现不同的结果
parse_date("01/02/15", "%m/%d/%y")
#> [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
#> [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
#> [1] "2001-02-15"
- locale封装时间格式
# 封装
new_locale <- locale(date_format = "%d/%m/%Y")
parse_date("04/11/2020", locale = new_locale)
# [1] "2020-11-04"
- Exercises
d1 <- "January 1, 2010"
parse_date(d1, '%B %d, %Y')
# [1] "2010-01-01"
d2 <- "2015-Mar-07"
parse_date(d2, '%Y-%b-%d')
# [1] "2015-03-07"
d3 <- "06-Jun-2017"
parse_date(d3, '%d-%b-%Y')
# [1] "2017-06-06"
d4 <- c("August 19 (2015)", "July 1 (2015)")
parse_date(d4, '%B %d (%Y)')
# [1] "2015-08-19" "2015-07-01"
d5 <- "12/30/14" # Dec 30, 2014
# 注意月份要用小写的 %m;大写的 %M 表示分钟,误用会得到错误的 "2014-01-30"
parse_date(d5, '%m/%d/%y')
# [1] "2014-12-30"
t1 <- "1705"
parse_time(t1, '%H%M')
# 17:05:00
t2 <- "11:15:10.12 PM"
parse_time(t2, '%I:%M:%OS %p')
# 23:15:10.12
解析导入的文件各列
- 经典解析策略
readr uses a heuristic to figure out the type of each column: it reads the first 1000 rows and uses some (moderately conservative) heuristics to figure out the type of each column.
guess_parser("2010-10-01")
#> [1] "date"
guess_parser("15:01")
#> [1] "time"
guess_parser(c("TRUE", "FALSE"))
#> [1] "logical"
guess_parser(c("1", "5", "9"))
#> [1] "double"
guess_parser(c("12,352,561"))
#> [1] "number"
str(parse_guess("2010-10-10"))
#> Date[1:1], format: "2010-10-10"
- 具体解析策略
The heuristic tries each of the following types, stopping when it finds a match:
- logical: contains only “F”, “T”, “FALSE”, or “TRUE”.
- integer: contains only numeric characters (and -).
- double: contains only valid doubles (including numbers like 4.5e-5).
- number: contains valid doubles with the grouping mark inside.
- time: matches the default time_format.
- date: matches the default date_format.
- date-time: any ISO8601 date.
- If none of these rules apply, then the column will stay as a vector of strings.
只通过前1000行启发式解析会带来的问题:
- The first thousand rows might be a special case, and readr guesses a type that is not sufficiently general. For example, you might have a column of doubles that only contains integers in the first 1000 rows.
- The column might contain a lot of missing values. If the first 1000 rows contain only NAs, readr will guess that it’s a logical vector, whereas you probably want to parse it as something more specific.
这里的解析再一次说明了NA是逻辑型:
readr contains a challenging CSV that illustrates both of these problems
challenge <- read_csv(readr_example("challenge.csv"))
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_logical()
#> )
#> Warning: 1000 parsing failures.
#> row col expected actual file
#> 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 '/home/travis/R/Library/readr/extdata/challenge.csv'
#> 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 '/home/travis/R/Library/readr/extdata/challenge.csv'
#> 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 '/home/travis/R/Library/readr/extdata/challenge.csv'
#> 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 '/home/travis/R/Library/readr/extdata/challenge.csv'
#> 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 '/home/travis/R/Library/readr/extdata/challenge.csv'
#> .... ... .................. .......... ....................................................
#> See problems(...) for more details.
# 使用problems()详细捕获异常
problems(challenge)
#> # A tibble: 1,000 x 5
#> row col expected actual file
#> <int> <chr> <chr> <chr> <chr>
#> 1 1001 y 1/0/T/F/TRUE/FA… 2015-01-… '/home/travis/R/Library/readr/extdata/…
#> 2 1002 y 1/0/T/F/TRUE/FA… 2018-05-… '/home/travis/R/Library/readr/extdata/…
#> 3 1003 y 1/0/T/F/TRUE/FA… 2015-09-… '/home/travis/R/Library/readr/extdata/…
#> 4 1004 y 1/0/T/F/TRUE/FA… 2012-11-… '/home/travis/R/Library/readr/extdata/…
#> 5 1005 y 1/0/T/F/TRUE/FA… 2020-01-… '/home/travis/R/Library/readr/extdata/…
#> 6 1006 y 1/0/T/F/TRUE/FA… 2016-04-… '/home/travis/R/Library/readr/extdata/…
#> # … with 994 more rows
- 非字符型的解析:指定解析类型
tail(challenge)
#> # A tibble: 6 x 2
#> x y
#> <dbl> <lgl>
#> 1 0.805 NA
#> 2 0.164 NA
#> 3 0.472 NA
#> 4 0.718 NA
#> 5 0.270 NA
#> 6 0.608 NA
challenge <- read_csv(
readr_example("challenge.csv"),
col_types = cols(
x = col_double(),
y = col_date()
)
)
tail(challenge)
#> # A tibble: 6 x 2
#> x y
#> <dbl> <date>
#> 1 0.805 2019-11-21
#> 2 0.164 2018-03-29
#> 3 0.472 2014-08-04
#> 4 0.718 2015-08-16
#> 5 0.270 2020-02-04
#> 6 0.608 2019-01-06
Every parse_xyz() function has a corresponding col_xyz() function. You use parse_xyz() when the data is in a character vector in R already; you use col_xyz() when you want to tell readr how to load the data.
- stop_for_problems:严格的捕获错误
解析失败默认只产生warning,程序会继续运行;stop_for_problems()在存在解析问题时会直接抛出error并终止运行
stop_for_problems(
x <- parse_double("It cost $123.45")
)
# 错误: 1 parsing failure
- 其他解析策略
- 修改默认启发解析的行数:提高解析成功的准确度
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_date(format = "")
#> )
- 指定默认解析结局的类型
# 默认全部列均解析为character
challenge2 <- read_csv(readr_example("challenge.csv"),
col_types = cols(.default = col_character())
)
- type_convert启发式转换类型
library(tibble)
df <- tribble(
~x, ~y,
#--/----
"1", "1.21",
"2", "2.32",
"3", "4.56"
)
str(df)
# tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
# $ x: chr [1:3] "1" "2" "3"
# $ y: chr [1:3] "1.21" "2.32" "4.56"
df
# # A tibble: 3 x 2
# x y
# <chr> <chr>
# 1 1 1.21
# 2 2 2.32
# 3 3 4.56
df <- type_convert(df)
str(df)
# tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
# $ x: num [1:3] 1 2 3
# $ y: num [1:3] 1.21 2.32 4.56
df
# # A tibble: 3 x 2
# x y
# <dbl> <dbl>
# 1 1 1.21
# 2 2 2.32
# 3 3 4.56
写入文件
readr also comes with two useful functions for writing data back to disk: write_csv() and write_tsv().
Both functions increase the chances of the output file being read back in correctly by:
- Always encoding strings in UTF-8.
- Saving dates and date-times in ISO8601 format so they are easily parsed elsewhere.
If you want to export a csv file to Excel, use write_excel_csv() — this writes a special character (a “byte order mark”) at the start of the file which tells Excel that you’re using the UTF-8 encoding.
write_csv(x, path, na = "NA", append = FALSE, col_names = !append,
quote_escape = "double")
# x 要写出的数据框(tibble),path 写入路径,na 缺失值在文件中写成的字符串
# col_names 是否在文件首行写出列名,append 是否追加写入(FALSE 时覆盖已有文件)
- 需要注意的细节
- 写入csv后列属性会丢失
challenge
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")
#> Parsed with column specification:
#> cols(
#> x = col_double(),
#> y = col_logical()
#> )
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <lgl>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
解决办法:
- 以rds形式写入
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 NA
#> 2 4172 NA
#> 3 3004 NA
#> 4 787 NA
#> 5 37 NA
#> 6 2332 NA
#> # … with 1,994 more rows
- feather包
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
#> # A tibble: 2,000 x 2
#> x y
#> <dbl> <date>
#> 1 404 <NA>
#> 2 4172 <NA>
#> 3 3004 <NA>
#> 4 787 <NA>
#> 5 37 <NA>
#> 6 2332 <NA>
#> # ... with 1,994 more rows
- Feather tends to be faster than RDS and is usable outside of R.
- RDS supports list-columns; feather currently does not.