《R for Data Science》第十五章 factors 啃书知识点积累
参考链接:R for Data Science
Creating factors
# Months stored as a plain character vector.
x1 <- c("Dec", "Apr", "Jan", "Mar")
纯粹创建一个向量记录月份,有两个缺点:
- 没有很好的办法避免打字错误
# Same data with a typo ("Jam"); a character vector accepts it without complaint.
x2 <- c("Dec", "Apr", "Jam", "Mar")
- 排序只能按照首字母顺序
# Character vectors sort alphabetically, not in calendar order.
sort(x1)
#> [1] "Apr" "Dec" "Jan" "Mar"
策略:创建factor
首先创建levels
# Calendar-ordered month abbreviations ("Jan", "Feb", ..., "Dec") used as the
# valid factor levels. Base R ships exactly this vector as `month.abb`.
month_levels <- month.abb
然后创建因子
# Build the factor against the explicit levels; sorting now follows level order.
y1 <- factor(x1, levels = month_levels)
sort(y1)
#> [1] Jan Mar Apr Dec
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
- 如果向量中的值不存在于levels中会被静默转换为NA
可以改用 `readr::parse_factor()`,遇到不在 levels 中的值时会给出 warning
# Input containing a value ("Jam") that is not one of the month levels.
x2 <- c("Dec", "Apr", "Jam", "Mar")

# factor() silently converts values missing from `levels` to NA.
y2 <- factor(x2, levels = month_levels)
y2
#> [1] Dec Apr <NA> Mar
#> Levels: Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec

# readr::parse_factor() warns about the same value instead. It is called
# with its namespace because readr has not been attached at this point.
y2 <- readr::parse_factor(x2, levels = month_levels)
#> Warning: 1 parsing failure.
#> row col expected actual
#> 3 -- value in level set Jam
- 如果不设定levels,会自动创建按照字母表顺序的levels
# Without `levels`, factor() derives them from the data in alphabetical order.
factor(x1)
#> [1] Dec Apr Jan Mar
#> Levels: Apr Dec Jan Mar
- 按照分类变量第一次出现的顺序设定levels
方法一:创建时用unique
# unique() keeps first-appearance order, so the levels follow the data.
f1 <- factor(x1, levels = unique(x1))
f1
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
方法二:创建后用fct_inorder
# Build the factor first, then let fct_inorder() reorder the levels by
# first appearance in the data.
f2 <- forcats::fct_inorder(factor(x1))
f2
#> [1] Dec Apr Jan Mar
#> Levels: Dec Apr Jan Mar
- `levels()` 直接查询因子内部水平
# levels() returns the factor's levels as a character vector.
levels(f2)
#> [1] "Dec" "Apr" "Jan" "Mar"
General Social Survey
# Open the help page for the gss_cat dataset directly.
# (`??` would run a help search instead of showing the page.)
?forcats::gss_cat
- 分类变量映射ggplot2的x轴
会自动转为 factor 并且删除没有任何值的级别,可以用 `drop = FALSE` 强制显示
# tidyverse attaches forcats (which provides gss_cat), dplyr and the %>% pipe
# used in the snippets below. Only ggplot2 and patchwork were loaded before.
library(tidyverse)
library(ggplot2)
library(patchwork)

# Default behaviour: levels with no observations are dropped from the x axis.
p1 <- ggplot(gss_cat, aes(race)) +
  geom_bar()

# drop = FALSE keeps the empty levels on the axis.
p2 <- ggplot(gss_cat, aes(race)) +
  geom_bar() +
  scale_x_discrete(drop = FALSE)

# patchwork combines the two plots into one figure for comparison.
p1 + p2
- Exercises
# Bar chart of reported income, with the non-response categories greyed out.
gss_cat %>%
  # Drop respondents whose income is "Not applicable".
  filter(!rincome %in% c("Not applicable")) %>%
  # Rename one level to a more readable label.
  mutate(rincome = fct_recode(rincome,
    "Less than $1000" = "Lt $1000"
  )) %>%
  # Flag the non-response levels so they can get a different fill colour.
  mutate(rincome_na = rincome %in% c("Refused", "Don't know", "No answer")) %>%
  ggplot(aes(x = rincome, fill = rincome_na)) +
  geom_bar() +
  coord_flip() +
  scale_y_continuous("Number of Respondents", labels = scales::comma) +
  scale_x_discrete("Respondent's Income") +
  # Black for real income brackets, grey for non-responses.
  scale_fill_manual(values = c("FALSE" = "black", "TRUE" = "gray")) +
  # The documented value is lowercase "none"; "None" is not a valid position.
  theme(legend.position = "none")
Modifying factor order
It’s often useful to change the order of the factor levels in a visualisation.
- 依数值重排序 fct_reorder
# Mean age and TV hours, plus group size, for each religion.
relig_summary <- gss_cat %>%
  group_by(relig) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )

# Baseline: levels in their stored order.
p1 <- ggplot(relig_summary, aes(tvhours, relig)) +
  geom_point()

# fct_reorder() sorts the levels in ascending order of `tvhours` by default
# (`.desc = FALSE`); pass `.desc = TRUE` for descending order.
p2 <- ggplot(relig_summary, aes(tvhours, fct_reorder(relig, tvhours))) +
  geom_point()

# reorder(), as mentioned in the EDA chapter, can be used the same way.
ggplot(relig_summary, aes(tvhours, reorder(relig, tvhours))) +
  geom_point()

p1 + p2
- 自定义重排序 fct_relevel
It takes a factor, `f`, and then any number of levels that you want to move to the front of the line.
# Mean age and TV hours, plus respondent count, per reported income bracket.
rincome_summary <- gss_cat %>%
  group_by(rincome) %>%
  summarise(
    age = mean(age, na.rm = TRUE),
    tvhours = mean(tvhours, na.rm = TRUE),
    n = n()
  )

# Income levels in their stored order.
p1 <- rincome_summary %>%
  ggplot(aes(x = age, y = rincome)) +
  geom_point()

# Same plot with "Not applicable" moved to the front of the level order.
p2 <- rincome_summary %>%
  ggplot(aes(x = age, y = fct_relevel(rincome, "Not applicable"))) +
  geom_point()

p1 + p2
- 调节图例顺序 fct_reorder2()
`fct_reorder2()` reorders the factor by the `y` values associated with the largest `x` values. This makes the plot easier to read because the line colours line up with the legend.
主要作用是调节图例顺序便于阅读
# Share of each marital status within every age (rows with missing age removed).
by_age <- gss_cat %>%
  filter(!is.na(age)) %>%
  count(age, marital) %>%
  group_by(age) %>%
  mutate(prop = n / sum(n))

# Legend in the factor's stored level order.
by_age %>%
  ggplot(aes(x = age, y = prop, colour = marital)) +
  geom_line(na.rm = TRUE)

# Legend reordered with fct_reorder2(); the legend title is reset to "marital"
# because the colour aesthetic is now an expression.
by_age %>%
  ggplot(aes(x = age, y = prop, colour = fct_reorder2(marital, age, prop))) +
  geom_line() +
  labs(colour = "marital")
- 另一个例子:各党派每年比例的变化
# Yearly share of each collapsed party affiliation. Both plots below use this
# one summary instead of repeating the whole pipeline.
partyid_by_year <- gss_cat %>%
  mutate(
    partyid = fct_collapse(
      partyid,
      Others = c("No answer", "Don't know", "Other party"),
      Republican = c("Strong republican", "Not str republican"),
      Independent = c("Ind,near rep", "Independent", "Ind,near dem"),
      Democrat = c("Not str democrat", "Strong democrat")
    )
  ) %>%
  count(year, partyid) %>%
  group_by(year) %>%
  mutate(proportions = n / sum(n))

# NOTE(review): ggplot2 >= 3.4.0 deprecates `size` for line width in favour of
# `linewidth`; switch once the installed ggplot2 version is confirmed.

# Legend in the factor's stored level order.
p1 <- ggplot(
  partyid_by_year,
  aes(year, proportions, colour = partyid)
) +
  geom_point() +
  geom_line(size = 1)

# Legend reordered with fct_reorder2() so it lines up with the lines.
p2 <- ggplot(
  partyid_by_year,
  aes(year, proportions, colour = fct_reorder2(partyid, year, proportions))
) +
  geom_point() +
  geom_line(size = 1) +
  labs(colour = "Party ID")

p1 + p2
- 柱形图的简易重排
利用 `fct_infreq()` 和 `fct_rev()`
# fct_infreq() orders the levels by frequency, most common first, so the bars
# run from tallest to shortest (decreasing).
p1 <- gss_cat %>%
  mutate(marital = marital %>% fct_infreq()) %>%
  ggplot(aes(marital)) +
  geom_bar()

# Adding fct_rev() reverses that order, so the bars run from shortest to
# tallest (increasing).
p2 <- gss_cat %>%
  mutate(marital = marital %>% fct_infreq() %>% fct_rev()) %>%
  ggplot(aes(marital)) +
  geom_bar()

p1 + p2
- 判断一个dataset哪些变量是factor
# Full structure dump; the column types are listed in the output.
str(gss_cat)

# Shorter alternative: keep only the factor columns and list their names.
gss_cat %>%
  keep(is.factor) %>%
  names()
#> [1] "marital" "race" "rincome" "partyid" "relig" "denom"
Modifying factor levels
More powerful than changing the orders of the levels is changing their values.
- 修改变量中的值 fct_recode()
# Original party labels and their counts.
count(gss_cat, partyid)
#> # A tibble: 10 x 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Strong republican 2314
#> 5 Not str republican 3032
#> 6 Ind,near rep 1791
#> # … with 4 more rows

# fct_recode() takes `new = old` pairs; the counts stay the same and only the
# labels change.
gss_cat %>%
  mutate(
    partyid = fct_recode(
      partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat"
    )
  ) %>%
  count(partyid)
#> # A tibble: 10 x 2
#> partyid n
#> <fct> <int>
#> 1 No answer 154
#> 2 Don't know 1
#> 3 Other party 393
#> 4 Republican, strong 2314
#> 5 Republican, weak 3032
#> 6 Independent, near rep 1791
#> # … with 4 more rows
`fct_recode()` will leave levels that aren’t explicitly mentioned as is, and will warn you if you accidentally refer to a level that doesn’t exist.
- 可以将多个不同值整合为同一种便于分组
# Several old levels are assigned the same new label ("Other").
gss_cat %>%
  mutate(
    partyid = fct_recode(
      partyid,
      "Republican, strong" = "Strong republican",
      "Republican, weak" = "Not str republican",
      "Independent, near rep" = "Ind,near rep",
      "Independent, near dem" = "Ind,near dem",
      "Democrat, weak" = "Not str democrat",
      "Democrat, strong" = "Strong democrat",
      "Other" = "No answer",
      "Other" = "Don't know",
      "Other" = "Other party"
    )
  ) %>%
  count(partyid)
- 同时整合多个值 fct_collapse()
# fct_collapse() takes one vector of old levels per new level, so many levels
# can be merged in a single call.
gss_cat %>%
  mutate(
    partyid = fct_collapse(
      partyid,
      other = c("No answer", "Don't know", "Other party"),
      rep = c("Strong republican", "Not str republican"),
      ind = c("Ind,near rep", "Independent", "Ind,near dem"),
      dem = c("Not str democrat", "Strong democrat")
    )
  ) %>%
  count(partyid)
#> # A tibble: 4 x 2
#> partyid n
#> <fct> <int>
#> 1 other 548
#> 2 rep 5346
#> 3 ind 8409
#> 4 dem 7180
- 放一个案例:整合收入数据可视化
# Collapse the income brackets into coarser groups before plotting. The old
# level names are written out literally, one vector per new group.
gss_cat %>%
  mutate(
    rincome = fct_collapse(
      rincome,
      "Unknown" = c("No answer", "Don't know", "Refused", "Not applicable"),
      "Less than $5000" = c(
        "Lt $1000",
        "$1000 to 2999",
        "$3000 to 3999",
        "$4000 to 4999"
      ),
      "$5000 to 10000" = c(
        "$5000 to 5999",
        "$6000 to 6999",
        "$7000 to 7999",
        "$8000 to 9999"
      )
    )
  ) %>%
  ggplot(aes(x = rincome)) +
  geom_bar() +
  # Horizontal bars leave room for the long labels.
  coord_flip()
- 自动堆砌值,多值化少值 fct_lump()
整合方式是从最少堆开始逐渐向上吞并
一般用于无序数据的整合
# With no extra arguments fct_lump() merges the small groups into "Other";
# here that leaves just two levels.
# NOTE(review): newer forcats releases recommend the specific fct_lump_*()
# variants over fct_lump() — confirm the installed version before switching.
gss_cat %>%
  mutate(
    relig = fct_lump(relig)
  ) %>%
  count(relig)
#> # A tibble: 2 x 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Other 10637
- 可以用参数 `n` 控制保留的水平个数(其余合并为 Other)
# Keep the 10 most common religions and merge the rest into "Other";
# print(n = Inf) shows every row of the result.
gss_cat %>%
  mutate(
    relig = fct_lump(relig, n = 10)
  ) %>%
  count(relig, sort = TRUE) %>%
  print(n = Inf)
#> # A tibble: 10 x 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Catholic 5124
#> 3 None 3523
#> 4 Christian 689
#> 5 Other 458
#> 6 Jewish 388
#> 7 Buddhism 147
#> 8 Inter-nondenominational 109
#> 9 Moslem/islam 104
#> 10 Orthodox-christian 95

# Keep only the 5 most common religions; everything else is merged into "Other".
gss_cat %>%
  mutate(
    relig = fct_lump(relig, n = 5)
  ) %>%
  count(relig, sort = TRUE)
#> # A tibble: 6 x 2
#> relig n
#> <fct> <int>
#> 1 Protestant 10846
#> 2 Catholic 5124
#> 3 None 3523
#> 4 Other 913
#> 5 Christian 689
#> 6 Jewish 388