基础图形可视化
数据分析的图形可视化是了解数据分布、波动和相关性等属性必不可少的手段。不同的图形类型对数据属性的表征各不相同,通常具体问题使用具体的可视化图形。R语言在可视化方面具有极大的优势,因其本身就是统计学家为了研究统计问题开发的编程语言,因此极力推荐使用R语言可视化数据。
图形类型及其使用意义
散点图
散点图是由x值和y值确定的点散乱分布在坐标轴上,一是可以用来展示数据的分布和聚合情况,二是可通过分布情况得到x和y之间的趋势结论。多用于回归分析,发现自变量和因变量的变化趋势,进而选择合适的函数对数据点进行拟合。
library(ggplot2)
library(dplyr)

# Scatter plot of car weight against fuel economy, with one shape/colour per
# cylinder count. The columns used below (wt, mpg, cyl) and the row-name labels
# are those of the built-in `mtcars` data set; the pipe previously had no
# left-hand side and did not parse.
dat <- mtcars %>% mutate(cyl = factor(cyl))

ggplot(dat, aes(x = wt, y = mpg, shape = cyl, color = cyl)) +
  geom_point(size = 3, alpha = 0.4) +
  # Dashed linear fit with a confidence band, one per cylinder group.
  geom_smooth(method = lm, linetype = "dashed",
              color = "darkred", fill = "blue") +
  # Label every point with its car model (the row name).
  geom_text(aes(label = rownames(dat)), size = 4) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(size = 10, color = "black", face = "bold",
                              hjust = 0.5),
    axis.title = element_text(size = 10, color = "black", face = "bold"),
    axis.text = element_text(size = 9, color = "black"),
    # Negative length draws the ticks inside the panel.
    axis.ticks.length = unit(-0.05, "in"),
    axis.text.y = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"),
                               size = 9),
    axis.text.x = element_blank(),
    text = element_text(size = 8, color = "black"),
    strip.text = element_text(size = 9, color = "black", face = "bold"),
    panel.grid = element_blank()
  )
直方图
直方图是一种对数据分布情况进行可视化的图形,它是二维统计图表,对应两个坐标分别是统计样本以及该样本对应的某个属性如频率等度量。
library(ggplot2)
# scale_fill_wsj() and theme_wsj() are provided by ggthemes, which was never
# attached in the original snippet.
library(ggthemes)

# Grouped bar chart of yearly sales per company in Wall Street Journal style.
data <- data.frame(
  Company = c("Apple", "Google", "Facebook", "Amazon", "Tencent"),
  Sale2013 = c(5000, 3500, 2300, 2100, 3100),
  Sale2014 = c(5050, 3800, 2900, 2500, 3300),
  Sale2015 = c(5050, 3800, 2900, 2500, 3300),
  Sale2016 = c(5050, 3800, 2900, 2500, 3300)
)

# Wide -> long: one row per (Company, Year) pair so Year can drive the fill.
mydata <- tidyr::gather(data, Year, Sale, -Company)

ggplot(mydata, aes(Company, Sale, fill = Year)) +
  # stat = "identity" plots the Sale values as given; "dodge" puts the years
  # side by side instead of stacking them.
  geom_bar(stat = "identity", position = "dodge") +
  guides(fill = guide_legend(title = NULL)) +
  ggtitle("The Financial Performance of Five Giant") +
  scale_fill_wsj("rgby", "") +
  theme_wsj() +
  theme(
    axis.ticks.length = unit(0.5, "cm"),
    axis.title = element_blank()
  )
library(patternplot)

# Black-and-white bar chart with pattern fills, using the monthly-expenses
# example data bundled with patternplot. Only the "City 1" rows are plotted.
data <- read.csv(system.file("extdata", "monthlyexp.csv",
                             package = "patternplot"))
data <- data[which(data$City == "City 1"), ]

# Fix the bar order explicitly through the factor levels.
x <- factor(data$Type, c("Housing", "Food", "Childcare"))
y <- data$Monthly_Expenses

# One entry per bar, in the same order as the levels of `x`.
pattern.type <- c("hdashes", "blank", "crosshatch")
pattern.color <- c("black", "black", "black")
background.color <- c("white", "white", "white")
density <- c(20, 20, 10)

# The original expression ended with an unmatched ")" and did not parse.
# ggtitle() comes from ggplot2, which must already be attached.
patternplot::patternbar(data, x, y, group = NULL,
                        ylab = "Monthly Expenses, Dollar",
                        pattern.type = pattern.type,
                        pattern.color = pattern.color,
                        background.color = background.color,
                        pattern.line.size = 0.5,
                        frame.color = c("black", "black", "black"),
                        density = density) +
  ggtitle("(A) Black and White with Patterns")
箱线图
箱线图是一种显示一组数据分布情况的统计图,它形状像箱子因此也被称为箱形图。它通过六个数据节点将一组数据从大到小排列(上极限到下极限),反映原始数据分布特征。意义在于发现关键数据如平均值、任何异常值、数据分布紧密度和偏分布等。
library(ggplot2)
library(dplyr)

# Box plot of fruit weight per fruit type with whiskers, group means, group
# sizes and pairwise Wilcoxon significance marks.
# NOTE(review): `dat` is not defined in this snippet; it is expected to provide
# `Fruit` and `Weight` columns with three distinct fruits — confirm against the
# data-loading step.
pr <- unique(dat$Fruit)
grp.col <- c("#999999", "#E69F00", "#56B4E9")

dat %>%
  mutate(Fruit = factor(Fruit)) %>%
  ggplot(aes(x = Fruit, y = Weight, color = Fruit)) +
  # Horizontal caps at the whisker ends.
  stat_boxplot(geom = "errorbar", width = 0.15) +
  geom_boxplot(aes(fill = Fruit), width = 0.4, outlier.colour = "black",
               outlier.shape = 21, outlier.size = 1) +
  # Group mean as a black dot (`fun` replaces the deprecated `fun.y`).
  stat_summary(fun = mean, geom = "point", shape = 16,
               size = 2, color = "black") +
  # Show the number of observations of each group near the top of the panel.
  stat_summary(fun.data = function(x) {
    data.frame(y = 0.98 * 120, label = length(x))
  }, geom = "text", hjust = 0.5, color = "red", size = 6) +
  # stat_compare_means() lives in ggpubr, which is not attached here, so it is
  # called through its namespace.
  ggpubr::stat_compare_means(
    comparisons = list(c(pr[1], pr[2]), c(pr[1], pr[3]), c(pr[2], pr[3])),
    label = "p.signif", method = "wilcox.test"
  ) +
  labs(title = "Weight of Fruit", x = "Fruit", y = "Weight (kg)") +
  scale_color_manual(values = grp.col, labels = pr) +
  scale_fill_manual(values = grp.col, labels = pr) +
  # Hide both legends. The original wrote `fil = F`, a typo that left the fill
  # legend visible, and logical values here are deprecated in favour of "none".
  guides(color = "none", fill = "none") +
  # Mirror the y axis on the right without labels or title (`labels` spelled
  # out instead of relying on partial matching of `label`).
  scale_y_continuous(sec.axis = dup_axis(labels = NULL, name = NULL),
                     breaks = seq(90, 108, 2), limits = c(90, 120)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(size = 10, color = "black", face = "bold",
                              hjust = 0.5),
    axis.title = element_text(size = 10, color = "black", face = "bold"),
    axis.text = element_text(size = 9, color = "black"),
    # Negative length draws the ticks inside the panel.
    axis.ticks.length = unit(-0.05, "in"),
    axis.text.y = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"),
                               size = 9),
    axis.text.x = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm")),
    text = element_text(size = 8, color = "black"),
    strip.text = element_text(size = 9, color = "black", face = "bold"),
    panel.grid = element_blank()
  )
面积图
面积图是一种展示个体与整体的关系的统计图,更多用于时间序列变化的研究。
library(ggplot2)
library(dplyr)

# Stacked area chart of the mean weight per fruit across stores, with a dashed
# blue reference line at the mean of those per-group means.
# NOTE(review): `dat` is not defined in this snippet; it is expected to provide
# `Fruit`, `Store` and `Weight` columns — confirm against the data-loading step.
mean_weights <- summarize(group_by(dat, Fruit, Store),
                          mean_Weight = mean(Weight))

ggplot(mean_weights, aes(x = Store, group = Fruit)) +
  geom_area(
    aes(y = mean_Weight, fill = as.factor(Fruit)),
    position = "stack",
    linetype = "dashed"
  ) +
  geom_hline(
    aes(yintercept = mean(mean_Weight)),
    color = "blue",
    linetype = "dashed",
    size = 1
  ) +
  # Keep the fill legend but drop its title.
  guides(fill = guide_legend(title = NULL)) +
  theme_bw(base_size = 12) +
  theme(
    plot.title = element_text(size = 10, color = "black", face = "bold",
                              hjust = 0.5),
    axis.title = element_text(size = 10, color = "black", face = "bold"),
    axis.text = element_text(size = 9, color = "black"),
    # Negative length draws the ticks inside the panel.
    axis.ticks.length = unit(-0.05, "in"),
    axis.text.y = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"),
                               size = 9),
    axis.text.x = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm")),
    text = element_text(size = 8, color = "black"),
    strip.text = element_text(size = 9, color = "black", face = "bold"),
    panel.grid = element_blank()
  )
热图
热图也是一种对数据分布情况可视化的统计图形,如下图表现的是数据差异性的具象化实例。一般用于样本聚类等可视化过程。在基因表达或者丰度表达差异研究中,热图既可以展现数据质量间的差异性,也可以用于聚类等。
library(ggplot2)

# Heat map of a random 9-gene x 10-sample matrix drawn from a standard normal
# distribution (no seed is set, so every run produces different values).
data <- as.data.frame(matrix(rnorm(9 * 10), nrow = 9, ncol = 10))
rownames(data) <- paste0("Gene_", 1:9)
colnames(data) <- paste0("sample_", 1:10)

# Carry the gene name as a regular column so it survives the reshape.
data[["ID"]] <- rownames(data)

# Wide -> long: one row per (gene, sample) cell.
data_m <- tidyr::gather(data, sampleID, value, -ID)

ggplot(data_m, aes(x = sampleID, y = ID)) +
  geom_tile(aes(fill = value)) +
  # Diverging palette: green (low) through black (midpoint) to red (high).
  scale_fill_gradient2("Expression", low = "green", high = "red",
                       mid = "black") +
  labs(x = "samples") +
  theme_classic() +
  theme(
    axis.ticks = element_blank(),
    axis.line = element_blank(),
    panel.grid.major = element_blank(),
    legend.key = element_blank(),
    # Rotate the sample labels so they do not overlap.
    axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
    legend.position = "top"
  )
相关图
相关图是热图的一种特殊形式,展示的是样本间相关系数大小的热图。
library(corrplot)

# Mixed correlation plot: glyphs in the upper triangle, numeric coefficients in
# the lower triangle, variable names on the diagonal.
# NOTE(review): `dat` is not defined in this snippet; cor() requires its first
# seven columns to be numeric — confirm which data set is intended.
corr_mat <- cor(dat[1:7])

# First pass draws the upper triangle and the diagonal labels.
corrplot(corr = corr_mat, order = "AOE", type = "upper", tl.pos = "d")

# Second pass overlays the lower triangle on the same device (add = TRUE),
# without repeating the diagonal, text labels or colour legend.
corrplot(corr = corr_mat, add = TRUE, type = "lower", method = "number",
         order = "AOE", diag = FALSE, tl.pos = "n", cl.pos = "n")
折线图
折线图是反映数据分布趋势的可视化图形,其本质和堆积图或者说面积图有些相似。
library(ggplot2)
library(dplyr)

# Line chart of fruit weight over consecutive years, one line per fruit, built
# from a stratified sample of `dat`.
# NOTE(review): `dat` (with `Fruit` and `Weight` columns) and `pr` (the fruit
# labels) are not defined in this snippet; both are expected from earlier code.
grp.col <- c("#999999", "#E69F00", "#56B4E9")

# Per-fruit sample size: about 10% of the rows split over three fruits, rounded
# to the nearest ten. The year labels below are derived from this value; the
# original hard-coded 1996:2015, which only worked when the size was exactly 20.
n.per.grp <- round(nrow(dat) * 0.1 / 3, -1)

# Simple random sampling without replacement within each fruit.
dat.cln <- sampling::strata(dat, stratanames = "Fruit",
                            size = rep(n.per.grp, 3), method = "srswor")

dat %>%
  slice(dat.cln$ID_unit) %>%
  # Give each fruit the same run of consecutive years starting at 1996.
  # Presumably the sampled rows are grouped by fruit in order — TODO confirm.
  mutate(Year = factor(as.character(
    rep(seq(1996, length.out = n.per.grp), times = 3)
  ))) %>%
  ggplot(aes(x = Year, y = Weight, linetype = Fruit, colour = Fruit,
             shape = Fruit, fill = Fruit)) +
  # Year is a factor, so the group must be set explicitly to connect points.
  geom_line(aes(group = Fruit)) +
  geom_point() +
  scale_linetype_manual(values = c(1:3)) +
  scale_shape_manual(values = c(19, 21, 23)) +
  scale_color_manual(values = grp.col, labels = pr) +
  scale_fill_manual(values = grp.col, labels = pr) +
  theme_bw() +
  theme(
    plot.title = element_text(size = 10, color = "black", face = "bold",
                              hjust = 0.5),
    axis.title = element_text(size = 10, color = "black", face = "bold"),
    axis.text = element_text(size = 9, color = "black"),
    # Negative length draws the ticks inside the panel.
    axis.ticks.length = unit(-0.05, "in"),
    axis.text.y = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"),
                               size = 9),
    axis.text.x = element_text(margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm")),
    text = element_text(size = 8, color = "black"),
    strip.text = element_text(size = 9, color = "black", face = "bold"),
    panel.grid = element_blank()
  )
韦恩图
韦恩图是一种展示不同分组之间集合重叠区域的可视化图。
library(VennDiagram)

# Four-set Venn diagram of random 18-letter subsets of the alphabet, written to
# "Group4.png". No seed is set, so the overlaps differ on every run.
A <- sample(LETTERS, size = 18, replace = FALSE)
B <- sample(LETTERS, size = 18, replace = FALSE)
C <- sample(LETTERS, size = 18, replace = FALSE)
D <- sample(LETTERS, size = 18, replace = FALSE)

# The order of the list decides which fill colour each set receives.
venn_sets <- list(A = A, D = D, B = B, C = C)
venn_fill <- c("cornflowerblue", "green", "yellow", "darkorchid1")

venn.diagram(
  x = venn_sets,
  filename = "Group4.png",
  height = 450,
  width = 450,
  resolution = 300,
  imagetype = "png",
  col = "transparent",
  fill = venn_fill,
  alpha = 0.5,
  cex = 0.45,
  cat.cex = 0.45
)
library(ggplot2)
library(UpSetR)

# Example data sets bundled with UpSetR. The movies file is semicolon
# separated, the mutations file comma separated.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
movies <- read.csv(system.file("extdata", "movies.csv",
                               package = "UpSetR"), header = TRUE, sep = ";")
mutations <- read.csv(system.file("extdata", "mutations.csv",
                                  package = "UpSetR"), header = TRUE, sep = ",")
# Density plot of column `x`, filled by the decade derived from column `y`.
#
# data: data frame holding both columns.
# x:    name (string) of the column to draw the density of.
# y:    name (string) of the column whose values are rounded up to the next
#       multiple of ten to form the `decades` grouping.
#
# Rows whose decade is below 1970 (or missing) are dropped. The ggplot object
# is returned invisibly.
another.plot <- function(data, x, y) {
  y_values <- as.integer(unlist(data[y]))
  # Round up to the next multiple of ten, e.g. 1971 -> 1980.
  data$decades <- ceiling(y_values / 10) * 10

  # which() also discards rows where the comparison is NA.
  recent_rows <- which(data$decades >= 1970)
  data <- data[recent_rows, ]

  density_layer <- geom_density(aes(fill = factor(decades)), alpha = 0.4)
  compact_theme <- theme(
    plot.margin = unit(c(0, 0, 0, 0), "cm"),
    legend.key.size = unit(0.4, "cm")
  )

  invisible(
    ggplot(data, aes_string(x = x)) +
      density_layer +
      theme_bw() +
      compact_theme
  )
}
# UpSet plot of the movie genres with three highlighted intersections and three
# attribute plots underneath. The original call ended with an unmatched ")" and
# did not parse; `T`/`F` are spelled out as `TRUE`/`FALSE`.
upset(
  movies,
  main.bar.color = "black",
  mb.ratio = c(0.5, 0.5),
  # Each query highlights one exact intersection of genres.
  queries = list(
    list(query = intersects, params = list("Drama"),
         color = "red", active = FALSE),
    list(query = intersects, params = list("Action", "Drama"),
         active = TRUE),
    list(query = intersects, params = list("Drama", "Comedy", "Action"),
         color = "orange", active = TRUE)
  ),
  attribute.plots = list(
    gridrows = 50,
    plots = list(
      list(plot = histogram, x = "ReleaseDate", queries = FALSE),
      list(plot = scatter_plot, x = "ReleaseDate", y = "AvgRating",
           queries = TRUE),
      # Custom plot function defined earlier in this file.
      list(plot = another.plot, x = "AvgRating", y = "ReleaseDate",
           queries = FALSE)
    ),
    ncols = 3
  )
)
火山图
火山图通过两个属性Fold change和P value反映两组数据的差异性。
library(ggplot2)

# Volcano plot: log2 fold change against -log10 adjusted p-value.
# The input table must contain `log2FoldChange` and `padj` columns.
# file.choose() is used instead of choose.files(), which exists only on
# Windows.
data <- read.table(file.choose(), header = TRUE)

# Significant points have padj < 0.05 and |log2FC| >= 1; they are red when
# up-regulated and blue when down-regulated, everything else is gray.
# The direction is taken from the sign: the original tested `> 1`, so a point
# with log2FC exactly 1 passed the significance filter but was coloured blue.
is.signif <- data$padj < 0.05 & abs(data$log2FoldChange) >= 1
data$color <- ifelse(is.signif,
                     ifelse(data$log2FoldChange > 0, "red", "blue"),
                     "gray")

# Named palette so each label maps to the colour of the same name.
color <- c(red = "red", gray = "gray", blue = "blue")

ggplot(data, aes(log2FoldChange, -log10(padj), col = color)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = color) +
  labs(x = "log2 (fold change)", y = "-log10 (q-value)") +
  # Dashed guides at the significance and fold-change thresholds.
  geom_hline(yintercept = -log10(0.05), lty = 4, col = "grey", lwd = 0.6) +
  geom_vline(xintercept = c(-1, 1), lty = 4, col = "grey", lwd = 0.6) +
  theme(
    legend.position = "none",
    panel.grid = element_blank(),
    axis.title = element_text(size = 16),
    axis.text = element_text(size = 14)
  )
饼图
饼图是用于刻画分组间如频率等属性的相对关系图。
library(patternplot)

# Pie chart with coloured pattern fills, using the vegetables example data
# bundled with patternplot.
data <- read.csv(system.file("extdata", "vegetables.csv",
                             package = "patternplot"))

# One entry per slice.
pattern.type <- c("hdashes", "vdashes", "bricks")
pattern.color <- c("red3", "green3", "white")
background.color <- c("dodgerblue", "lightpink", "orange")

# The original expression ended with an unmatched ")" and did not parse.
# ggtitle() comes from ggplot2, which must already be attached.
patternpie(group = data$group, pct = data$pct,
           label = data$label, pattern.type = pattern.type,
           pattern.color = pattern.color,
           background.color = background.color, frame.color = "grey40",
           pixel = 0.3, pattern.line.size = 0.3, frame.size = 1.5,
           label.size = 5, label.distance = 1.35) +
  ggtitle("(B) Colors with Patterns")
密度曲线图
密度曲线图反映的是数据在不同区间的密度分布情况,和概率密度函数PDF
曲线类似。
library(ggplot2)
# NOTE(review): attaching plyr after dplyr masks dplyr verbs such as mutate()
# and summarise() for any code that runs later in the same session.
library(plyr)

# Overlaid histograms and density curves of simulated body weight by sex, with
# a dashed vertical line at each group mean.
set.seed(1234)
df <- data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = round(c(rnorm(200, mean = 55, sd = 5),
                   rnorm(200, mean = 65, sd = 5)))
)

# Mean weight per sex, used for the vertical reference lines.
mu <- ddply(df, "sex", summarise, grp.mean = mean(weight))

ggplot(df, aes(x = weight, fill = sex)) +
  # Scale the bars to density so they share the y axis with the curves.
  # after_stat(density) replaces the deprecated `..density..` notation, and
  # bins = 30 states the default explicitly to silence the bin-count message.
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5,
                 position = "identity", bins = 30) +
  geom_density(alpha = 0.4) +
  geom_vline(data = mu, aes(xintercept = grp.mean, color = sex),
             linetype = "dashed") +
  scale_color_grey() +
  theme_classic() +
  theme(legend.position = "top")