R可视化：基础图形可视化（一）

基础图形可视化

数据分析的图形可视化是了解数据分布、波动和相关性等属性必不可少的手段。不同的图形类型对数据属性的表征各不相同，通常具体问题使用具体的可视化图形。R语言本身就是统计学家为研究统计问题而开发的编程语言，在可视化方面具有极大的优势，因此极力推荐使用R语言进行数据可视化。

图形类型及其使用意义

散点图

散点图由x值和y值确定的点分布在坐标系中构成：一是可以用来展示数据的分布和聚合情况，二是可通过分布情况得到x和y之间的趋势结论。多用于回归分析，发现自变量和因变量的变化趋势，进而选择合适的函数对数据点进行拟合。

library(ggplot2)
library(dplyr)

# Scatter plot of car weight (wt) against fuel economy (mpg), with point shape
# and colour keyed to the number of cylinders.
# The original pipeline had no input on the left of `%>%`; the wt/mpg/cyl
# columns and the row-name labels used below are those of the built-in
# `mtcars` data set, so it is used as the source.
dat <- mtcars %>% mutate(cyl = factor(cyl))
ggplot(dat, aes(x = wt, y = mpg, shape = cyl, color = cyl)) +
    geom_point(size = 3, alpha = 0.4) +
    # Dashed linear fit; the fixed colour/fill override the cyl colour mapping
    geom_smooth(method = lm, linetype = "dashed",
        color = "darkred", fill = "blue") +
    # Label every point with its row name
    geom_text(aes(label = rownames(dat)), size = 4) +
    theme_bw(base_size = 12) +
    theme(plot.title = element_text(size = 10, color = "black", face = "bold", hjust = 0.5),
          axis.title = element_text(size = 10, color = "black", face = "bold"),
          axis.text = element_text(size = 9, color = "black"),
          # Negative length draws the tick marks inside the panel
          axis.ticks.length = unit(-0.05, "in"),
          axis.text.y = element_text(margin = unit(c(0.3, 0.3,
            0.3, 0.3), "cm"), size = 9),
          axis.text.x = element_blank(),
          text = element_text(size = 8, color = "black"),
          strip.text = element_text(size = 9, color = "black", face = "bold"),
          panel.grid = element_blank())

直方图

直方图是一种对数据分布情况进行可视化的图形，它是二维统计图表，对应两个坐标分别是统计样本以及该样本对应的某个属性如频率等度量。需要注意，下面的两个示例绘制的是与之形似的柱状图：柱高直接取自数据中给定的数值，而不是分箱统计得到的频数。

library(ggplot2)

# Grouped bar chart of yearly sales for five companies, styled with the
# Wall Street Journal theme from ggthemes.
data <- data.frame(
  Company = c("Apple", "Google", "Facebook", "Amazon", "Tencent"),
  Sale2013 = c(5000, 3500, 2300, 2100, 3100),
  Sale2014 = c(5050, 3800, 2900, 2500, 3300),
  Sale2015 = c(5050, 3800, 2900, 2500, 3300),
  Sale2016 = c(5050, 3800, 2900, 2500, 3300))
# Wide -> long: one row per (Company, Year) pair with its Sale value
mydata <- tidyr::gather(data, Year, Sale, -Company)
ggplot(mydata, aes(Company, Sale, fill = Year)) +
    # stat = "identity" uses Sale as the bar height; "dodge" puts years side by side
    geom_bar(stat = "identity", position = "dodge") +
    guides(fill = guide_legend(title = NULL)) +
    ggtitle("The Financial Performance of Five Giant") +
    # ggthemes is never attached in this snippet, so its functions are
    # namespace-qualified (the ggthemes package must be installed)
    ggthemes::scale_fill_wsj("rgby", "") +
    ggthemes::theme_wsj() +
    theme(
      axis.ticks.length = unit(0.5, "cm"),
      axis.title = element_blank())

library(patternplot)

# Black-and-white patterned bar chart of monthly expenses for one city, using
# the example CSV shipped with patternplot.
# ggtitle() comes from ggplot2, which must be attached (it is loaded by the
# preceding snippet of this article).
data <- read.csv(system.file("extdata", "monthlyexp.csv",
        package = "patternplot"))
# Keep only the rows for "City 1"
data <- data[which(data$City == "City 1"), ]
# Fix the bar order explicitly through the factor levels
x <- factor(data$Type, c("Housing", "Food", "Childcare"))
y <- data$Monthly_Expenses
# One entry per bar, in the same order as the levels of x
pattern.type <- c("hdashes", "blank", "crosshatch")
pattern.color <- c("black", "black", "black")
background.color <- c("white", "white", "white")
density <- c(20, 20, 10)

patternplot::patternbar(data, x, y, group = NULL,
        ylab = "Monthly Expenses, Dollar",
        pattern.type = pattern.type,
        pattern.color = pattern.color,
        background.color = background.color,
        pattern.line.size = 0.5,
        frame.color = c("black", "black", "black"), density = density) +
    ggtitle("(A) Black and White with Patterns")

箱线图

箱线图是一种显示一组数据分布情况的统计图，因形状像箱子也被称为箱形图。它通过六个数据节点将一组数据从大到小排列（上极限到下极限），反映原始数据分布特征。意义在于发现关键数据如平均值、任何异常值、数据分布紧密度和偏分布等。

library(ggplot2)
library(dplyr)

# Box plot of Weight per Fruit with whisker caps, group means, per-group
# sample sizes and pairwise Wilcoxon significance brackets.
# NOTE(review): `dat` must already contain `Fruit` and `Weight` columns; no
# such data set is created in this snippet, and the comparisons below assume
# exactly three Fruit values -- confirm before running.
# Plain character labels, so c(pr[i], pr[j]) below yields group names even
# when Fruit is stored as a factor.
pr <- as.character(unique(dat$Fruit))
grp.col <- c("#999999", "#E69F00", "#56B4E9")

dat %>% mutate(Fruit = factor(Fruit)) %>%
    ggplot(aes(x = Fruit, y = Weight, color = Fruit)) +
        # Horizontal caps on the whiskers
        stat_boxplot(geom = "errorbar", width = 0.15) +
        geom_boxplot(aes(fill = Fruit), width = 0.4, outlier.colour = "black",
                     outlier.shape = 21, outlier.size = 1) +
        # Group mean as a black dot (`fun` replaces the deprecated `fun.y`)
        stat_summary(fun = mean, geom = "point", shape = 16,
                     size = 2, color = "black") +
        # Show each group's sample size near the top of the panel
        stat_summary(fun.data = function(x) {
            return(data.frame(y = 0.98 * 120, label = length(x)))
            }, geom = "text", hjust = 0.5, color = "red", size = 6) +
        # ggpubr is not attached in this snippet, so the call is qualified
        ggpubr::stat_compare_means(comparisons = list(
            c(pr[1], pr[2]), c(pr[1], pr[3]), c(pr[2], pr[3])),
            label = "p.signif", method = "wilcox.test") +
        labs(title = "Weight of Fruit", x = "Fruit", y = "Weight (kg)") +
        scale_color_manual(values = grp.col, labels = pr) +
        scale_fill_manual(values = grp.col, labels = pr) +
        # Hide both legends; the original `fil = F` typo left the fill legend on
        guides(color = "none", fill = "none") +
        # Unlabelled duplicate axis on the right-hand side
        scale_y_continuous(sec.axis = dup_axis(
            labels = NULL, name = NULL),
            breaks = seq(90, 108, 2), limits = c(90, 120)) +
        theme_bw(base_size = 12) +
        theme(plot.title = element_text(size = 10, color = "black",
                                        face = "bold", hjust = 0.5),
              axis.title = element_text(size = 10,
                                        color = "black", face = "bold"),
              axis.text = element_text(size = 9, color = "black"),
              # Negative length draws the tick marks inside the panel
              axis.ticks.length = unit(-0.05, "in"),
              axis.text.y = element_text(margin = unit(c(0.3, 0.3,
                                          0.3, 0.3), "cm"), size = 9),
              axis.text.x = element_text(margin = unit(c(0.3,
                                          0.3, 0.3, 0.3), "cm")),
              text = element_text(size = 8, color = "black"),
              strip.text = element_text(size = 9, color = "black", face = "bold"),
              panel.grid = element_blank())

面积图

面积图是一种展示个体与整体的关系的统计图，更多用于时间序列变化的研究。

library(ggplot2)
library(dplyr)

# Stacked area chart of the mean Weight per Store, one band per Fruit, with a
# dashed reference line at the mean of the per-group means.
# NOTE(review): `dat` must already contain `Fruit`, `Store` and `Weight`
# columns; it is not created in this snippet.
dat %>%
    group_by(Fruit, Store) %>%
    # .groups = "drop" returns an ungrouped table and silences the
    # regrouping message summarize() would otherwise print
    summarize(mean_Weight = mean(Weight), .groups = "drop") %>%
    ggplot(aes(x = Store, group = Fruit)) +
        geom_area(aes(y = mean_Weight,
            fill = as.factor(Fruit)), position = "stack", linetype = "dashed") +
        # `linewidth` replaces the deprecated `size` for line thickness
        geom_hline(aes(yintercept = mean(mean_Weight)), color = "blue",
            linetype = "dashed", linewidth = 1) +
        guides(fill = guide_legend(title = NULL)) +
        theme_bw(base_size = 12) +
        theme(plot.title = element_text(size = 10,
                color = "black", face = "bold", hjust = 0.5),
            axis.title = element_text(size = 10,
                color = "black", face = "bold"),
            axis.text = element_text(size = 9, color = "black"),
            # Negative length draws the tick marks inside the panel
            axis.ticks.length = unit(-0.05, "in"),
            axis.text.y = element_text(margin = unit(c(0.3, 0.3,
                0.3, 0.3), "cm"), size = 9),
            axis.text.x = element_text(margin = unit(c(0.3,
                0.3, 0.3, 0.3), "cm")),
            text = element_text(size = 8, color = "black"),
            strip.text = element_text(size = 9,
                color = "black", face = "bold"),
            panel.grid = element_blank())

热图

热图也是一种对数据分布情况可视化的统计图形，如下图表现的是数据差异性的具象化实例。一般用于样本聚类等可视化过程。在基因表达或者丰度表达差异研究中，热图既可以展现数据质量间的差异性，也可以用于聚类等。

library(ggplot2)

# Heat map of a simulated expression matrix: 9 genes (rows) by 10 samples
# (columns) filled with standard-normal draws. No seed is set, so the values
# differ on every run.
data <- as.data.frame(matrix(
  rnorm(9 * 10),
  nrow = 9, ncol = 10,
  dimnames = list(paste0("Gene_", 1:9), paste0("sample_", 1:10))
))
# Carry the gene name as a regular column so it survives the reshape
data$ID <- rownames(data)
# Wide -> long: one row per (ID, sampleID) pair with its value
data_m <- tidyr::gather(data, key = sampleID, value = value, -ID)

ggplot(data_m, aes(x = sampleID, y = ID)) +
    geom_tile(aes(fill = value)) +
    # Diverging green-black-red fill scale titled "Expression"
    scale_fill_gradient2(name = "Expression",
                         low = "green", mid = "black", high = "red") +
    labs(x = "samples") +
    theme_classic() +
    theme(
        legend.position = "top",
        legend.key = element_blank(),
        axis.line = element_blank(),
        axis.ticks = element_blank(),
        # Slanted sample labels
        axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1),
        panel.grid.major = element_blank()
    )

折线图

折线图是反映数据分布趋势的可视化图形，其本质和堆积图或者说面积图有些相似。

library(ggplot2)
library(dplyr)

# Line chart of Weight over Year for a stratified random sample of `dat`,
# one line per Fruit.
# NOTE(review): `dat` (with Fruit and Weight columns) and `pr` (legend labels)
# are not defined in this snippet and must already exist.
grp.col <- c("#999999", "#E69F00", "#56B4E9")
# Simple random sampling without replacement inside each Fruit stratum; the
# per-stratum size is a third of 10% of the rows, rounded to the nearest ten.
dat.cln <- sampling::strata(
    data = dat,
    stratanames = "Fruit",
    size = rep(round(nrow(dat) * 0.1 / 3, digits = -1), times = 3),
    method = "srswor"
)

dat %>%
    slice(dat.cln$ID_unit) %>%
    # 1996:2015 repeated three times yields 60 labels, so this presumably
    # expects 20 sampled rows per Fruit -- TODO confirm against `dat`
    mutate(
        Year = as.character(rep(1996:2015, times = 3)),
        Year = factor(as.character(Year))
    ) %>%
    ggplot(aes(x = Year, y = Weight, linetype = Fruit, colour = Fruit,
               shape = Fruit, fill = Fruit)) +
        # Explicit group so the discrete x axis still gets connected lines
        geom_line(aes(group = Fruit)) +
        geom_point() +
        scale_linetype_manual(values = 1:3) +
        scale_shape_manual(values = c(19, 21, 23)) +
        scale_color_manual(values = grp.col, labels = pr) +
        scale_fill_manual(values = grp.col, labels = pr) +
        theme_bw() +
        theme(
            plot.title = element_text(size = 10, color = "black",
                                      face = "bold", hjust = 0.5),
            axis.title = element_text(size = 10, color = "black", face = "bold"),
            axis.text = element_text(size = 9, color = "black"),
            # Negative length draws the tick marks inside the panel
            axis.ticks.length = unit(-0.05, "in"),
            axis.text.y = element_text(
                margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm"), size = 9),
            axis.text.x = element_text(
                margin = unit(c(0.3, 0.3, 0.3, 0.3), "cm")),
            text = element_text(size = 8, color = "black"),
            strip.text = element_text(size = 9, color = "black", face = "bold"),
            panel.grid = element_blank()
        )

韦恩图

韦恩图是一种展示不同分组之间集合重叠区域的可视化图。

library(VennDiagram)

# Four random 18-letter subsets of LETTERS. sample() draws without
# replacement by default; no seed is set, so the sets differ on every run.
A <- sample(LETTERS, size = 18)
B <- sample(LETTERS, size = 18)
C <- sample(LETTERS, size = 18)
D <- sample(LETTERS, size = 18)

# Render the four-set Venn diagram into Group4.png
venn.diagram(
    x = list(A = A, D = D, B = B, C = C),
    filename = "Group4.png", imagetype = "png",
    height = 450, width = 450, resolution = 300,
    col = "transparent",
    fill = c("cornflowerblue", "green", "yellow", "darkorchid1"),
    alpha = 0.5,
    cex = 0.45, cat.cex = 0.45
)

library(ggplot2)
library(UpSetR)

# Example data sets shipped with UpSetR: movies.csv is ';'-separated,
# mutations.csv is ','-separated. `mutations` is loaded but not used by the
# plot below.
movies <- read.csv(system.file("extdata", "movies.csv",
                package = "UpSetR"), header = TRUE, sep = ";")
mutations <- read.csv(system.file("extdata", "mutations.csv",
                package = "UpSetR"), header = TRUE, sep = ",")

# Custom attribute plot passed to upset() below: density of column `x`, one
# filled curve per decade derived from column `y`.
#
# data: data frame holding the columns named by `x` and `y`
# x:    name (string) of the column whose density is drawn
# y:    name (string) of the column that is rounded up to decades
# Returns the ggplot object.
another.plot <- function(data, x, y) {
  # Round `x` to a multiple of `accuracy` using rounding function `f`
  round_any_new <- function(x, accuracy, f = round) {
    f(x / accuracy) * accuracy
  }
  # ceiling: every value is pushed up to the next multiple of ten
  data$decades <- round_any_new(as.integer(data[[y]]), 10, ceiling)
  # Keep only rows whose decade is 1970 or later
  data <- data[which(data$decades >= 1970), ]
  # .data[[x]] replaces the deprecated aes_string(); labs() keeps the column
  # name as the axis title
  ggplot(data, aes(x = .data[[x]])) +
    geom_density(aes(fill = factor(decades)), alpha = 0.4) +
    labs(x = x) +
    theme_bw() +
    theme(plot.margin = unit(c(0, 0, 0, 0), "cm"),
          legend.key.size = unit(0.4, "cm"))
}

# UpSet plot of the movie genres with three highlighted intersections and a
# row of three attribute plots (histogram, scatter plot, custom density).
upset(movies, main.bar.color = "black",
      mb.ratio = c(0.5, 0.5),
      # Each query selects the intersection of the listed genre sets
      queries = list(
        list(query = intersects, params = list("Drama"),
             color = "red", active = FALSE),
        list(query = intersects, params = list("Action", "Drama"),
             active = TRUE),
        list(query = intersects, params = list("Drama", "Comedy", "Action"),
             color = "orange", active = TRUE)),
      attribute.plots = list(gridrows = 50,
        plots = list(
          list(plot = histogram, x = "ReleaseDate", queries = FALSE),
          list(plot = scatter_plot, x = "ReleaseDate",
               y = "AvgRating", queries = TRUE),
          list(plot = another.plot, x = "AvgRating", y = "ReleaseDate",
               queries = FALSE)),
        ncols = 3))

火山图

火山图通过两个属性Fold change和P value反映两组数据的差异性。

library(ggplot2)

# Volcano plot: log2 fold change against -log10 adjusted p-value, with
# significant up-/down-regulated points coloured red/blue and the rest gray.
# file.choose() opens a file picker on every platform; choose.files() exists
# only on Windows. The chosen table must provide `log2FoldChange` and `padj`.
data <- read.table(file.choose(), header = TRUE)
# Significant = padj < 0.05 and |log2FC| >= 1. Missing values count as not
# significant so that every row receives a colour.
significant <- !is.na(data$padj) & data$padj < 0.05 &
  !is.na(data$log2FoldChange) & abs(data$log2FoldChange) >= 1
# The direction is decided by the sign, so a fold change of exactly 1 is
# coloured as up-regulated rather than falling through to "blue"
data$color <- ifelse(significant,
                     ifelse(data$log2FoldChange > 0, "red", "blue"), "gray")
color <- c(red = "red", gray = "gray", blue = "blue")

ggplot(data, aes(log2FoldChange, -log10(padj), col = color)) +
  geom_point() +
  theme_bw() +
  scale_color_manual(values = color) +
  labs(x = "log2 (fold change)", y = "-log10 (q-value)") +
  # Guide lines at the significance and fold-change thresholds
  geom_hline(yintercept = -log10(0.05), linetype = 4, colour = "grey",
             linewidth = 0.6) +
  geom_vline(xintercept = c(-1, 1), linetype = 4, colour = "grey",
             linewidth = 0.6) +
  theme(legend.position = "none",
        panel.grid = element_blank(),
        axis.title = element_text(size = 16),
        axis.text = element_text(size = 14))

饼图

饼图是用于刻画分组间如频率等属性的相对关系图。

library(patternplot)

# Patterned, coloured pie chart built from the example CSV shipped with
# patternplot.
# ggtitle() comes from ggplot2, which must be attached (it is loaded by the
# earlier snippets of this article).
data <- read.csv(system.file("extdata", "vegetables.csv",
                             package = "patternplot"))
# One entry per slice
pattern.type <- c("hdashes", "vdashes", "bricks")
pattern.color <- c("red3", "green3", "white")
background.color <- c("dodgerblue", "lightpink", "orange")

patternpie(group = data$group, pct = data$pct,
    label = data$label, pattern.type = pattern.type,
    pattern.color = pattern.color,
    background.color = background.color, frame.color = "grey40",
    pixel = 0.3, pattern.line.size = 0.3, frame.size = 1.5,
    label.size = 5, label.distance = 1.35) +
  ggtitle("(B) Colors with Patterns")

密度曲线图

密度曲线图反映的是数据在不同区间的密度分布情况，和概率密度函数PDF曲线类似。

library(ggplot2)
library(plyr)

# Overlaid histogram and density curves of simulated weights for two sexes,
# with a dashed vertical line at each group mean.
set.seed(1234)  # reproducible draws
df <- data.frame(
  sex = factor(rep(c("F", "M"), each = 200)),
  weight = round(c(rnorm(200, mean = 55, sd = 5),
                   rnorm(200, mean = 65, sd = 5)))
)
# Mean weight per sex. Both calls are namespace-qualified because dplyr also
# exports summarise() and is attached elsewhere in this article.
mu <- plyr::ddply(df, "sex", plyr::summarise, grp.mean = mean(weight))

ggplot(df, aes(x = weight, fill = sex)) +
  # after_stat(density) replaces the deprecated ..density.. notation and puts
  # the bars on the same scale as the density curves; bins = 30 states the
  # default explicitly, which silences the bin-width message
  geom_histogram(aes(y = after_stat(density)), alpha = 0.5,
                 position = "identity", bins = 30) +
  geom_density(alpha = 0.4) +
  geom_vline(data = mu, aes(xintercept = grp.mean, color = sex),
             linetype = "dashed") +
  scale_color_grey() +
  theme_classic() +
  theme(legend.position = "top")

参考

直方图定义

R可视化：基础图形可视化（一）

基础图形可视化

图形类型及其使用意义

散点图

直方图

箱线图

面积图

热图

相关图

折线图

韦恩图

火山图

饼图

密度曲线图

参考