什么是探索性数据分析
参看之前的文章:
数量生态学笔记||数据探索
环境与生态统计||探索性数据分析
环境与生态统计||探索性数据可视化
探索性数据分析的作用
- 对数据提出问题
- 对数据进行可视化、转换、建模,进而找出问题的答案
- 使用上一步的结果来精炼问题,并提出新问题
对分布进行可视化
# Preview the first six rows of the diamonds data set.
head(diamonds, n = 6L)
# A tibble: 6 x 10
carat cut color clarity depth table price x y z
<dbl> <ord> <ord> <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
# Bar chart: number of diamonds in each cut category.
ggplot(diamonds, aes(x = cut)) +
  geom_bar()
# Tabulate the number of diamonds per cut (the heights of the bar chart).
# Console prompts ("> " and "+ ") are removed so the snippet can be pasted
# and run as-is.
diamonds %>%
  count(cut)
# A tibble: 5 x 2
cut n
<ord> <int>
1 Fair 1610
2 Good 4906
3 Very Good 12082
4 Premium 13791
5 Ideal 21551
# Histogram of carat using 0.5-wide bins.
ggplot(diamonds, aes(x = carat)) +
  geom_histogram(binwidth = 0.5)
# Count the observations falling into each 0.5-wide carat interval.
# Console prompts ("> " and "+ ") are removed so the snippet can be pasted
# and run as-is.
diamonds %>%
  count(cut_width(carat, 0.5))
# A tibble: 11 x 2
`cut_width(carat, 0.5)` n
<fct> <int>
1 [-0.25,0.25] 785
2 (0.25,0.75] 29498
3 (0.75,1.25] 15977
4 (1.25,1.75] 5313
5 (1.75,2.25] 2002
6 (2.25,2.75] 322
7 (2.75,3.25] 32
8 (3.25,3.75] 5
9 (3.75,4.25] 4
10 (4.25,4.75] 1
11 (4.75,5.25] 1
# Restrict to diamonds under 3 carats and use narrower 0.1-wide bins.
ggplot(filter(diamonds, carat < 3)) +
  geom_histogram(aes(x = carat), binwidth = 0.1)
# One frequency polygon per cut, so the carat distributions can be overlaid.
diamonds %>%
  filter(carat < 3) %>%
  ggplot() +
  geom_freqpoly(aes(x = carat, colour = cut), binwidth = 0.1)
典型值
# Very fine 0.01-wide bins on diamonds under 3 carats.
ggplot(filter(diamonds, carat < 3)) +
  geom_histogram(aes(x = carat), binwidth = 0.01)
异常值
# Histogram of the y dimension at full scale.
p1 <- ggplot(diamonds, aes(x = y)) +
  geom_histogram(binwidth = 0.5)

# The same histogram with the count axis zoomed to 0-50.
p2 <- p1 +
  coord_cartesian(ylim = c(0, 50))

# Show both versions side by side.
library(gridExtra)
grid.arrange(p1, p2, ncol = 2, nrow = 1)
# Pull out the rows whose y lies outside the closed range [3, 20],
# keep only price and the three dimensions, and sort by y.
unusual <- diamonds %>%
  filter(!between(y, 3, 20)) %>%
  select(price, x, y, z) %>%
  arrange(y)
unusual
# A tibble: 9 x 4
price x y z
<int> <dbl> <dbl> <dbl>
1 5139 0 0 0
2 6381 0 0 0
3 12800 0 0 0
4 15686 0 0 0
5 18034 0 0 0
6 2130 0 0 0
7 2130 0 0 0
8 2075 5.15 31.8 5.12
9 12210 8.09 58.9 8.06
缺失值
- 将带有异常值的行全部丢弃(不推荐)
# Keep only the rows whose y lies within the closed range [3, 20].
diamonds2 <- diamonds %>%
  filter(y >= 3, y <= 20)
- 建议用缺失值(NA)代替异常值
# Keep every row, but set y to NA wherever it falls outside [3, 20].
diamonds2 <- diamonds %>%
  mutate(y = ifelse(between(y, 3, 20), y, NA))
# Scatterplot of x against y; rows with missing values are reported in a
# warning when the plot is drawn.
p1 <- ggplot(diamonds2, aes(x = x, y = y)) +
  geom_point()
Warning message:
Removed 9 rows containing missing values (geom_point).
# Same scatterplot, with na.rm = TRUE so missing values are dropped silently.
p2 <- ggplot(diamonds2) +
  geom_point(aes(x = x, y = y), na.rm = TRUE)

grid.arrange(p1, p2, ncol = 2, nrow = 1)
# Compare scheduled departure times of cancelled and non-cancelled flights.
# A flight is flagged as cancelled when dep_time is missing; the scheduled
# time is split into hour and minute parts and recombined as decimal hours.
nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(aes(x = sched_dep_time, colour = cancelled)) +
  geom_freqpoly(binwidth = 1 / 4)
相关变动
# Price distribution per cut, shown as raw counts in 500-wide price bins.
p1 <- ggplot(data = diamonds, mapping = aes(x = price)) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

# Number of diamonds in each cut category.
p2 <- ggplot(diamonds) +
  geom_bar(mapping = aes(x = cut))

# Same polygons as p1, but with density on the y axis instead of count.
# after_stat(density) replaces the deprecated ..density.. notation.
p3 <- ggplot(
  data = diamonds,
  mapping = aes(x = price, y = after_stat(density))
) +
  geom_freqpoly(mapping = aes(colour = cut), binwidth = 500)

grid.arrange(p1, p2, p3, ncol = 3, nrow = 1)
# Price by cut.
p1 <- ggplot(diamonds, aes(x = cut, y = price)) +
  geom_boxplot()

# Highway mileage by vehicle class, classes in their default order.
p2 <- ggplot(mpg, aes(x = class, y = hwy)) +
  geom_boxplot()

# Same boxplot with classes reordered by their median hwy.
p3 <- ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_boxplot()

# Reordered boxplot with the axes flipped.
p4 <- p3 +
  coord_flip()

# Violin-plot version of the flipped, reordered display.
p5 <- ggplot(mpg, aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  geom_violin() +
  coord_flip()

grid.arrange(p1, p2, p3, p4, p5, ncol = 5, nrow = 1)
两个分类变量
# Point size encodes how many diamonds share each cut/color combination.
p1 <- ggplot(diamonds, aes(x = cut, y = color)) +
  geom_count()

# Tabulate the combinations first, then draw them as tiles filled by count.
p2 <- count(diamonds, color, cut) %>%
  ggplot(aes(x = color, y = cut, fill = n)) +
  geom_tile()

# The table of counts behind the tile plot.
count(diamonds, color, cut)
#> # A tibble: 35 x 3
#> color cut n
#> <ord> <ord> <int>
#> 1 D Fair 163
#> 2 D Good 662
#> 3 D Very Good 1513
#> 4 D Premium 1603
#> 5 D Ideal 2834
#> 6 E Fair 224
#> # … with 29 more rows
grid.arrange(p1, p2, ncol = 2, nrow = 1)
两个连续变量
# Plain scatterplot of price against carat.
p1 <- ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point()

# Same scatterplot with nearly transparent points.
p2 <- ggplot(diamonds, aes(x = carat, y = price)) +
  geom_point(alpha = 1 / 100)

# Diamonds under 3 carats, used by the remaining plots.
smaller <- filter(diamonds, carat < 3)

# Rectangular 2D bins.
p3 <- ggplot(smaller, aes(x = carat, y = price)) +
  geom_bin2d()

# Hexagonal 2D bins.
# install.packages("hexbin")
p4 <- ggplot(smaller, aes(x = carat, y = price)) +
  geom_hex()

# Bin carat into 0.1-wide groups and draw one boxplot of price per group.
p5 <- ggplot(smaller, aes(x = carat, y = price)) +
  geom_boxplot(aes(group = cut_width(carat, 0.1)))

grid.arrange(p1, p2, p3, p4, p5, ncol = 5, nrow = 1)
模式和模型
- 模式是不是巧合
- 如何描述隐含关系
- 隐含关系有多强
- 其他变量如何影响这种关系
- 如果分别查看数据的各个子组,这种关系会发生变化吗
library(modelr)

# Linear model of log(price) on log(carat).
mod <- lm(log(price) ~ log(carat), data = diamonds)

# Attach the model residuals and exponentiate them.
diamonds2 <- mutate(
  add_residuals(diamonds, mod),
  resid = exp(resid)
)

# Residuals against carat.
p1 <- ggplot(diamonds2, aes(x = carat, y = resid)) +
  geom_point()

# Residuals by cut.
p2 <- ggplot(diamonds2, aes(x = cut, y = resid)) +
  geom_boxplot()

grid.arrange(p1, p2, ncol = 2, nrow = 1)
ggplot2 调用
# Concise form: the first two aes() arguments are matched by position
# to x and y.
count(diamonds, cut, clarity) %>%
  ggplot(aes(clarity, cut, fill = n)) +
  geom_tile()