实战 - 简书

首先在NCBI里找到该文章的GSE编号：GSE5282

step_1

数据下载

> rm(list = ls())
> options(stringsAsFactors = F)
> library(GEOquery)
> eSet <- getGEO("GSE5282", 
+                destdir = '.', 
+                getGPL = F)
Found 1 file(s)
GSE5282_series_matrix.txt.gz
Using locally cached version: ./GSE5282_series_matrix.txt.gz
Parsed with column specification:
cols(
  ID_REF = col_character(),
  GSM75265 = col_double(),
  GSM75267 = col_double(),
  GSM75268 = col_double(),
  GSM75269 = col_double(),
  GSM75270 = col_double(),
  GSM75271 = col_double(),
  GSM75272 = col_double(),
  GSM75273 = col_double(),
  GSM75274 = col_double(),
  GSM75275 = col_double(),
  GSM75276 = col_double(),
  GSM75277 = col_double()
)

提取表达矩阵

> exp <- exprs(eSet[[1]]) #expr这个函数
> exp[1:4,1:4]
           GSM75265 GSM75267 GSM75268 GSM75269
1367452_at  22866.6  24411.0  29900.6  23920.9
1367453_at  12933.1  14618.1  13697.1  10271.8
1367454_at  13152.8  12829.3  14364.1  15371.0
1367455_at  22381.5  22356.9  21329.9  25713.4
> head(exp)
           GSM75265 GSM75267 GSM75268 GSM75269 GSM75270 GSM75271 GSM75272 GSM75273
1367452_at  22866.6  24411.0  29900.6  23920.9  26858.7  31436.0  17297.6  17692.9
1367453_at  12933.1  14618.1  13697.1  10271.8  11392.7   9771.2   9422.3   8778.6
1367454_at  13152.8  12829.3  14364.1  15371.0  15560.3  18875.7  18753.0  16089.0
1367455_at  22381.5  22356.9  21329.9  25713.4  26994.4  20608.0  20162.0  21117.5
1367456_at  29495.6  29813.2  32294.8  18181.0  18743.0  20218.1  15768.2  15723.4
1367457_at   6507.2   6230.5   6680.4   8407.6   8015.3   6628.6   8302.6   7684.0
           GSM75274 GSM75275 GSM75276 GSM75277
1367452_at  19851.8  14508.7  18228.9  25956.7
1367453_at   7200.7  13336.1  13385.0  24065.1
1367454_at  20180.8  26553.1  29002.3  22035.7
1367455_at  20247.1  17793.0  23830.1  26177.2
1367456_at  16547.5  25720.0  29269.4  24553.4
1367457_at   8506.3  10941.2  12325.5   8090.8

提取临床信息,用pData函数

> pd <- pData(eSet[[1]])
> save(pd,exp,file = "step1output.Rdata")
> eSet[[1]]@annotation
[1] "GPL1355"

step_2 获得分组信息

> class(pd)
[1] "data.frame"
> dim(pd)
[1] 12 34
> colnames(pd)
 [1] "title"                   "geo_accession"           "status"                 
 [4] "submission_date"         "last_update_date"        "type"                   
 [7] "channel_count"           "source_name_ch1"         "organism_ch1"           
[10] "characteristics_ch1"     "molecule_ch1"            "extract_protocol_ch1"   
[13] "label_ch1"               "label_protocol_ch1"      "taxid_ch1"              
[16] "hyb_protocol"            "scan_protocol"           "description"            
[19] "data_processing"         "platform_id"             "contact_name"           
[22] "contact_email"           "contact_phone"           "contact_fax"            
[25] "contact_laboratory"      "contact_department"      "contact_institute"      
[28] "contact_address"         "contact_city"            "contact_state"          
[31] "contact_zip/postal_code" "contact_country"         "supplementary_file"     
[34] "data_row_count"         
> pd$title
 [1] "EGF 4h replicate 1"              "EGF 4h replicate 2"             
 [3] "EGF 4h replicate 3"              "Control for EGF 4h replicate 1" 
 [5] "Control for EGF 4h replicate 2"  "Control for EGF 4h replicate 3" 
 [7] "Control for EGF 12h replicate 1" "Control for EGF 12h replicate 2"
 [9] "Control for EGF 12h replicate 3" "EGF 12h replicate 1"            
[11] "EGF 12h replicate 2"             "EGF12h replicate 3"             
> group_list_1 <- rep(c('control','EGF 12'),c(3,2))
> save(group_list_1,file = "step2output.Rdata")

step_3 主成分分析

挑出数据

exp1 <-exp[,7:11]###即三组control,两组EGF 12

查看数据

> dim(exp1)
[1] 31099     5
> dat1 <- as.data.frame(t(exp1))###转置列为基因名，行为样本
> dat1[1:4,1:4]
         1367452_at 1367453_at 1367454_at 1367455_at
GSM75272      17298       9422      18753      20162
GSM75273      17693       8779      16089      21118
GSM75274      19852       7201      20181      20247
GSM75275      14509      13336      26553      17793
> dim(exp1)
[1] 31099     5
> dim(dat1)
[1]     5 31099
> dat1 <- cbind(dat1,group_list_1)###将分组信息加入
> dim(dat1)
[1]     5 31100

PCA的统一操作走起

> library(FactoMineR)#画主成分分析图需要加载这两个包
> library(factoextra)

进行主成分分析

> dat1.pca <- PCA(dat1[,-ncol(dat1)], graph = FALSE)

画PCA图

> fviz_pca_ind(dat1.pca,
+              geom.ind = "point", # show points only (nbut not "text")
+              col.ind = dat1$group_list_1, # color by groups
+              palette = c("#00AFBB", "#E7B800"),##https://www.114la.com/other/rgb.htm
+              addEllipses = TRUE, # Concentration ellipses
+              legend.title = "Groups"
+ )
Too few points to calculate an ellipse
Too few points to calculate an ellipse
> ggsave('all_samples_PCA.png')
Saving 9.03 x 5.6 in image
Too few points to calculate an ellipse
Too few points to calculate an ellipse

image.png

step_4差异分析

单个基因差异分析

> head(exp1)
           GSM75272 GSM75273 GSM75274 GSM75275 GSM75276
1367452_at    17298    17693    19852    14509    18229
1367453_at     9422     8779     7201    13336    13385
1367454_at    18753    16089    20181    26553    29002
1367455_at    20162    21118    20247    17793    23830
1367456_at    15768    15723    16548    25720    29269
1367457_at     8303     7684     8506    10941    12326
> table(group_list_1)
group_list_1
control  EGF 12 
      3       2 
> boxplot(exp1[1,]~group_list_1)

image.png

> bp=function(g){         #定义一个函数g，函数为{}里的内容
+   library(ggpubr)
+   df=data.frame(gene=g,group=group_list_1)
+   p <- ggboxplot(df, x = "group", y = "gene",
+                  color = "group", palette = "jco",
+                  add = "jitter")
+   #  Add p-value
+   p + stat_compare_means(label.y = 8)
+ }
> bp(exp1[1,]) ## 调用上面定义好的函数，避免同样的绘图代码重复多次敲。
> bp(exp1[2,])
> bp(exp1[3,])
> bp(exp1[4,])

image.png

差异分析，用limma包来做

需要表达矩阵和group_list，其他都不要动

> library(limma)
> design=model.matrix(~factor(group_list_1))
> fit=lmFit(exp1,design)
> fit=eBayes(fit)
> dim(fit)
[1] 31099     2

差异基因排名

> deg=topTable(fit,coef=2,number = Inf)
> head(deg)
             logFC AveExpr     t   P.Value adj.P.Val      B
1384934_at    9942    5460 178.1 7.191e-09 0.0001338 -4.585
1393927_at    7620    3152 170.2 8.604e-09 0.0001338 -4.585
1389351_at   11065    6215 139.7 1.879e-08 0.0001796 -4.585
1372569_at    3797    3203 131.1 2.411e-08 0.0001796 -4.585
1370426_a_at  8926   13513 121.6 3.244e-08 0.0001796 -4.585
1370085_at   19271   11247 119.6 3.466e-08 0.0001796 -4.585
> bp(exp1[rownames(deg)[1],])
> bp(exp1[rownames(deg)[2],])

image.png

为deg数据框添加几列

1.加probe_id列，把行名变成一列

> library(dplyr)
> deg <- mutate(deg,probe_id=rownames(deg))
> #tibble::rownames_to_column()
> head(deg)
  logFC AveExpr     t   P.Value adj.P.Val      B     probe_id
1  9942    5460 178.1 7.191e-09 0.0001338 -4.585   1384934_at
2  7620    3152 170.2 8.604e-09 0.0001338 -4.585   1393927_at
3 11065    6215 139.7 1.879e-08 0.0001796 -4.585   1389351_at
4  3797    3203 131.1 2.411e-08 0.0001796 -4.585   1372569_at
5  8926   13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at
6 19271   11247 119.6 3.466e-08 0.0001796 -4.585   1370085_at

2.加symbol列，火山图要用

id转换，查找芯片平台对应的包
http://www.bio-info-trainee.com/1399.html

> library(rat2302.db)
> ls("package:rat2302.db")
 [1] "rat2302"              "rat2302.db"           "rat2302_dbconn"      
 [4] "rat2302_dbfile"       "rat2302_dbInfo"       "rat2302_dbschema"    
 [7] "rat2302ACCNUM"        "rat2302ALIAS2PROBE"   "rat2302CHR"          
[10] "rat2302CHRLENGTHS"    "rat2302CHRLOC"        "rat2302CHRLOCEND"    
[13] "rat2302ENSEMBL"       "rat2302ENSEMBL2PROBE" "rat2302ENTREZID"     
[16] "rat2302ENZYME"        "rat2302ENZYME2PROBE"  "rat2302GENENAME"     
[19] "rat2302GO"            "rat2302GO2ALLPROBES"  "rat2302GO2PROBE"     
[22] "rat2302MAPCOUNTS"     "rat2302ORGANISM"      "rat2302ORGPKG"       
[25] "rat2302PATH"          "rat2302PATH2PROBE"    "rat2302PFAM"         
[28] "rat2302PMID"          "rat2302PMID2PROBE"    "rat2302PROSITE"      
[31] "rat2302REFSEQ"        "rat2302SYMBOL"        "rat2302UNIGENE"      
[34] "rat2302UNIPROT"      
> ids <- toTable(rat2302SYMBOL)
> head(ids)
    probe_id symbol
1 1367452_at  Sumo2
2 1367453_at  Cdc37
3 1367454_at  Copb2
4 1367455_at    Vcp
5 1367457_at  Becn1
6 1367458_at Lypla2

merge

> deg <- inner_join(deg,ids,by="probe_id")
> head(deg)
  logFC AveExpr     t   P.Value adj.P.Val      B     probe_id  symbol
1  9942    5460 178.1 7.191e-09 0.0001338 -4.585   1384934_at Slc41a2
2 11065    6215 139.7 1.879e-08 0.0001796 -4.585   1389351_at Lrrfip1
3  3797    3203 131.1 2.411e-08 0.0001796 -4.585   1372569_at    Fhl3
4  8926   13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at  Atp2a2
5 19271   11247 119.6 3.466e-08 0.0001796 -4.585   1370085_at   Rasa1
6 15631    6872 111.4 4.593e-08 0.0002041 -4.585   1383846_at    Dok5

3.加change列：上调或下调，火山图要用

> logFC_t <- 1 #不同的阈值，筛选到的差异基因数量就不一样，后面的超几何分布检验结果就大相径庭。
> change=ifelse(deg$P.Value>0.01,'stable', 
+               ifelse( deg$logFC >logFC_t,'up', 
+                       ifelse( deg$logFC < -logFC_t,'down','stable') )
+ )
> deg <- mutate(deg,change)
> head(deg)
  logFC AveExpr     t   P.Value adj.P.Val      B     probe_id  symbol change
1  9942    5460 178.1 7.191e-09 0.0001338 -4.585   1384934_at Slc41a2     up
2 11065    6215 139.7 1.879e-08 0.0001796 -4.585   1389351_at Lrrfip1     up
3  3797    3203 131.1 2.411e-08 0.0001796 -4.585   1372569_at    Fhl3     up
4  8926   13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at  Atp2a2     up
5 19271   11247 119.6 3.466e-08 0.0001796 -4.585   1370085_at   Rasa1     up
6 15631    6872 111.4 4.593e-08 0.0002041 -4.585   1383846_at    Dok5     up
> table(deg$change)

  down stable     up 
  2214  16726   1978

火山图

> head(deg)
  logFC AveExpr     t   P.Value adj.P.Val      B     probe_id  symbol change
1  9942    5460 178.1 7.191e-09 0.0001338 -4.585   1384934_at Slc41a2     up
2 11065    6215 139.7 1.879e-08 0.0001796 -4.585   1389351_at Lrrfip1     up
3  3797    3203 131.1 2.411e-08 0.0001796 -4.585   1372569_at    Fhl3     up
4  8926   13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at  Atp2a2     up
5 19271   11247 119.6 3.466e-08 0.0001796 -4.585   1370085_at   Rasa1     up
6 15631    6872 111.4 4.593e-08 0.0002041 -4.585   1383846_at    Dok5     up
> plot(deg$logFC,-log10(deg$P.Value))

image.png

> library(dplyr)
> dat1 <- mutate(deg,v=-log10(P.Value))
> head(dat1)
  logFC AveExpr     t   P.Value adj.P.Val      B     probe_id  symbol change     v
1  9942    5460 178.1 7.191e-09 0.0001338 -4.585   1384934_at Slc41a2     up 8.143
2 11065    6215 139.7 1.879e-08 0.0001796 -4.585   1389351_at Lrrfip1     up 7.726
3  3797    3203 131.1 2.411e-08 0.0001796 -4.585   1372569_at    Fhl3     up 7.618
4  8926   13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at  Atp2a2     up 7.489
5 19271   11247 119.6 3.466e-08 0.0001796 -4.585   1370085_at   Rasa1     up 7.460
6 15631    6872 111.4 4.593e-08 0.0002041 -4.585   1383846_at    Dok5     up 7.338

ggpubr绘图

> library(ggpubr)
> ggscatter(dat1, x = "logFC", y = "v",size=0.5,color = "change")

image.png

> ggscatter(dat1, x = "logFC", y = "v", color = "change",size = 0.5,
+           label = "symbol", repel = T,
+           label.select = dat1$symbol[1:30] ,
+           #label.select = c('CD36','DUSP6'), #挑选一些基因在图中显示出来
+           palette = c("#00AFBB", "#999999", "#FC4E07") )

image.png

绘制热图

挑出高表达基因

> library(pheatmap)
> table(deg$change)

  down stable     up 
  2214  16726   1978 
> x <- deg$change
> names(x) <- deg$probe_id
> table(x=='up')

FALSE  TRUE 
18940  1978 
> table(x[x=='up'])

  up 
1978 
> cg <- names(x[x=='up'])

归一化和阈值设置

> n=t(scale(t(exp1[cg,])))
> thr=2
> n[n>thr]=thr
> n[n< -thr]= -thr
> n[1:4,1:4]
             GSM75272 GSM75273 GSM75274 GSM75275
1384934_at    -0.7193  -0.7427  -0.7288    1.103
1389351_at    -0.7207  -0.7487  -0.7213    1.085
1372569_at    -0.7330  -0.7321  -0.7258    1.090
1370426_a_at  -0.7428  -0.7250  -0.7228    1.114

显示分组，绘制热图

> ac=data.frame(group=group_list_1)
> rownames(ac)=colnames(n) 
> pheatmap(n,show_colnames =T,show_rownames = F,cluster_col = F,annotation_col = ac,color = colorRampPalette(c("navy", "white", "firebrick3"))(50))

image.png

实战

step_1

数据下载

提取表达矩阵

提取临床信息,用pData函数

step_2 获得分组信息

step_3 主成分分析

挑出数据

查看数据

PCA的统一操作走起

进行主成分分析

画PCA图

step_4差异分析

单个基因差异分析

差异分析，用limma包来做

需要表达矩阵和group_list，其他都不要动

差异基因排名

为deg数据框添加几列

1.加probe_id列，把行名变成一列

2.加symbol列，火山图要用

merge

3.加change列：上调或下调，火山图要用

火山图

ggpubr绘图

绘制热图

挑出高表达基因

归一化和阈值设置

显示分组，绘制热图

推荐阅读更多精彩内容