首先在NCBI里找到该文章的GSE编号:GSE5282
step_1
数据下载
> rm(list = ls())
> options(stringsAsFactors = F)
> library(GEOquery)
> eSet <- getGEO("GSE5282",
+ destdir = '.',
+ getGPL = F)
Found 1 file(s)
GSE5282_series_matrix.txt.gz
Using locally cached version: ./GSE5282_series_matrix.txt.gz
Parsed with column specification:
cols(
ID_REF = col_character(),
GSM75265 = col_double(),
GSM75267 = col_double(),
GSM75268 = col_double(),
GSM75269 = col_double(),
GSM75270 = col_double(),
GSM75271 = col_double(),
GSM75272 = col_double(),
GSM75273 = col_double(),
GSM75274 = col_double(),
GSM75275 = col_double(),
GSM75276 = col_double(),
GSM75277 = col_double()
)
提取表达矩阵
> exp <- exprs(eSet[[1]]) #expr这个函数
> exp[1:4,1:4]
GSM75265 GSM75267 GSM75268 GSM75269
1367452_at 22866.6 24411.0 29900.6 23920.9
1367453_at 12933.1 14618.1 13697.1 10271.8
1367454_at 13152.8 12829.3 14364.1 15371.0
1367455_at 22381.5 22356.9 21329.9 25713.4
> head(exp)
GSM75265 GSM75267 GSM75268 GSM75269 GSM75270 GSM75271 GSM75272 GSM75273
1367452_at 22866.6 24411.0 29900.6 23920.9 26858.7 31436.0 17297.6 17692.9
1367453_at 12933.1 14618.1 13697.1 10271.8 11392.7 9771.2 9422.3 8778.6
1367454_at 13152.8 12829.3 14364.1 15371.0 15560.3 18875.7 18753.0 16089.0
1367455_at 22381.5 22356.9 21329.9 25713.4 26994.4 20608.0 20162.0 21117.5
1367456_at 29495.6 29813.2 32294.8 18181.0 18743.0 20218.1 15768.2 15723.4
1367457_at 6507.2 6230.5 6680.4 8407.6 8015.3 6628.6 8302.6 7684.0
GSM75274 GSM75275 GSM75276 GSM75277
1367452_at 19851.8 14508.7 18228.9 25956.7
1367453_at 7200.7 13336.1 13385.0 24065.1
1367454_at 20180.8 26553.1 29002.3 22035.7
1367455_at 20247.1 17793.0 23830.1 26177.2
1367456_at 16547.5 25720.0 29269.4 24553.4
1367457_at 8506.3 10941.2 12325.5 8090.8
提取临床信息,用pData函数
> pd <- pData(eSet[[1]])
> save(pd,exp,file = "step1output.Rdata")
> eSet[[1]]@annotation
[1] "GPL1355"
step_2 获得分组信息
> class(pd)
[1] "data.frame"
> dim(pd)
[1] 12 34
> colnames(pd)
[1] "title" "geo_accession" "status"
[4] "submission_date" "last_update_date" "type"
[7] "channel_count" "source_name_ch1" "organism_ch1"
[10] "characteristics_ch1" "molecule_ch1" "extract_protocol_ch1"
[13] "label_ch1" "label_protocol_ch1" "taxid_ch1"
[16] "hyb_protocol" "scan_protocol" "description"
[19] "data_processing" "platform_id" "contact_name"
[22] "contact_email" "contact_phone" "contact_fax"
[25] "contact_laboratory" "contact_department" "contact_institute"
[28] "contact_address" "contact_city" "contact_state"
[31] "contact_zip/postal_code" "contact_country" "supplementary_file"
[34] "data_row_count"
> pd$title
[1] "EGF 4h replicate 1" "EGF 4h replicate 2"
[3] "EGF 4h replicate 3" "Control for EGF 4h replicate 1"
[5] "Control for EGF 4h replicate 2" "Control for EGF 4h replicate 3"
[7] "Control for EGF 12h replicate 1" "Control for EGF 12h replicate 2"
[9] "Control for EGF 12h replicate 3" "EGF 12h replicate 1"
[11] "EGF 12h replicate 2" "EGF12h replicate 3"
> group_list_1 <- rep(c('control','EGF 12'),c(3,2))
> save(group_list_1,file = "step2output.Rdata")
step_3 主成分分析
挑出数据
exp1 <-exp[,7:11]###即三组control,两组EGF 12
查看数据
> dim(exp1)
[1] 31099 5
> dat1 <- as.data.frame(t(exp1))###转置列为基因名,行为样本
> dat1[1:4,1:4]
1367452_at 1367453_at 1367454_at 1367455_at
GSM75272 17298 9422 18753 20162
GSM75273 17693 8779 16089 21118
GSM75274 19852 7201 20181 20247
GSM75275 14509 13336 26553 17793
> dim(exp1)
[1] 31099 5
> dim(dat1)
[1] 5 31099
> dat1 <- cbind(dat1,group_list_1)###将分组信息加入
> dim(dat1)
[1] 5 31100
PCA的统一操作走起
> library(FactoMineR)#画主成分分析图需要加载这两个包
> library(factoextra)
进行主成分分析
> dat1.pca <- PCA(dat1[,-ncol(dat1)], graph = FALSE)
画PCA图
> fviz_pca_ind(dat1.pca,
+ geom.ind = "point", # show points only (nbut not "text")
+ col.ind = dat1$group_list_1, # color by groups
+ palette = c("#00AFBB", "#E7B800"),##https://www.114la.com/other/rgb.htm
+ addEllipses = TRUE, # Concentration ellipses
+ legend.title = "Groups"
+ )
Too few points to calculate an ellipse
Too few points to calculate an ellipse
> ggsave('all_samples_PCA.png')
Saving 9.03 x 5.6 in image
Too few points to calculate an ellipse
Too few points to calculate an ellipse
image.png
step_4差异分析
单个基因差异分析
> head(exp1)
GSM75272 GSM75273 GSM75274 GSM75275 GSM75276
1367452_at 17298 17693 19852 14509 18229
1367453_at 9422 8779 7201 13336 13385
1367454_at 18753 16089 20181 26553 29002
1367455_at 20162 21118 20247 17793 23830
1367456_at 15768 15723 16548 25720 29269
1367457_at 8303 7684 8506 10941 12326
> table(group_list_1)
group_list_1
control EGF 12
3 2
> boxplot(exp1[1,]~group_list_1)
image.png
> bp=function(g){ #定义一个函数g,函数为{}里的内容
+ library(ggpubr)
+ df=data.frame(gene=g,group=group_list_1)
+ p <- ggboxplot(df, x = "group", y = "gene",
+ color = "group", palette = "jco",
+ add = "jitter")
+ # Add p-value
+ p + stat_compare_means(label.y = 8)
+ }
> bp(exp1[1,]) ## 调用上面定义好的函数,避免同样的绘图代码重复多次敲。
> bp(exp1[2,])
> bp(exp1[3,])
> bp(exp1[4,])
image.png
差异分析,用limma包来做
需要表达矩阵和group_list,其他都不要动
> library(limma)
> design=model.matrix(~factor(group_list_1))
> fit=lmFit(exp1,design)
> fit=eBayes(fit)
> dim(fit)
[1] 31099 2
差异基因排名
> deg=topTable(fit,coef=2,number = Inf)
> head(deg)
logFC AveExpr t P.Value adj.P.Val B
1384934_at 9942 5460 178.1 7.191e-09 0.0001338 -4.585
1393927_at 7620 3152 170.2 8.604e-09 0.0001338 -4.585
1389351_at 11065 6215 139.7 1.879e-08 0.0001796 -4.585
1372569_at 3797 3203 131.1 2.411e-08 0.0001796 -4.585
1370426_a_at 8926 13513 121.6 3.244e-08 0.0001796 -4.585
1370085_at 19271 11247 119.6 3.466e-08 0.0001796 -4.585
> bp(exp1[rownames(deg)[1],])
> bp(exp1[rownames(deg)[2],])
image.png
image.png
为deg数据框添加几列
1.加probe_id列,把行名变成一列
> library(dplyr)
> deg <- mutate(deg,probe_id=rownames(deg))
> #tibble::rownames_to_column()
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id
1 9942 5460 178.1 7.191e-09 0.0001338 -4.585 1384934_at
2 7620 3152 170.2 8.604e-09 0.0001338 -4.585 1393927_at
3 11065 6215 139.7 1.879e-08 0.0001796 -4.585 1389351_at
4 3797 3203 131.1 2.411e-08 0.0001796 -4.585 1372569_at
5 8926 13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at
6 19271 11247 119.6 3.466e-08 0.0001796 -4.585 1370085_at
2.加symbol列,火山图要用
id转换,查找芯片平台对应的包
http://www.bio-info-trainee.com/1399.html
> library(rat2302.db)
> ls("package:rat2302.db")
[1] "rat2302" "rat2302.db" "rat2302_dbconn"
[4] "rat2302_dbfile" "rat2302_dbInfo" "rat2302_dbschema"
[7] "rat2302ACCNUM" "rat2302ALIAS2PROBE" "rat2302CHR"
[10] "rat2302CHRLENGTHS" "rat2302CHRLOC" "rat2302CHRLOCEND"
[13] "rat2302ENSEMBL" "rat2302ENSEMBL2PROBE" "rat2302ENTREZID"
[16] "rat2302ENZYME" "rat2302ENZYME2PROBE" "rat2302GENENAME"
[19] "rat2302GO" "rat2302GO2ALLPROBES" "rat2302GO2PROBE"
[22] "rat2302MAPCOUNTS" "rat2302ORGANISM" "rat2302ORGPKG"
[25] "rat2302PATH" "rat2302PATH2PROBE" "rat2302PFAM"
[28] "rat2302PMID" "rat2302PMID2PROBE" "rat2302PROSITE"
[31] "rat2302REFSEQ" "rat2302SYMBOL" "rat2302UNIGENE"
[34] "rat2302UNIPROT"
> ids <- toTable(rat2302SYMBOL)
> head(ids)
probe_id symbol
1 1367452_at Sumo2
2 1367453_at Cdc37
3 1367454_at Copb2
4 1367455_at Vcp
5 1367457_at Becn1
6 1367458_at Lypla2
merge
> deg <- inner_join(deg,ids,by="probe_id")
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id symbol
1 9942 5460 178.1 7.191e-09 0.0001338 -4.585 1384934_at Slc41a2
2 11065 6215 139.7 1.879e-08 0.0001796 -4.585 1389351_at Lrrfip1
3 3797 3203 131.1 2.411e-08 0.0001796 -4.585 1372569_at Fhl3
4 8926 13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at Atp2a2
5 19271 11247 119.6 3.466e-08 0.0001796 -4.585 1370085_at Rasa1
6 15631 6872 111.4 4.593e-08 0.0002041 -4.585 1383846_at Dok5
3.加change列:上调或下调,火山图要用
> logFC_t <- 1 #不同的阈值,筛选到的差异基因数量就不一样,后面的超几何分布检验结果就大相径庭。
> change=ifelse(deg$P.Value>0.01,'stable',
+ ifelse( deg$logFC >logFC_t,'up',
+ ifelse( deg$logFC < -logFC_t,'down','stable') )
+ )
> deg <- mutate(deg,change)
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id symbol change
1 9942 5460 178.1 7.191e-09 0.0001338 -4.585 1384934_at Slc41a2 up
2 11065 6215 139.7 1.879e-08 0.0001796 -4.585 1389351_at Lrrfip1 up
3 3797 3203 131.1 2.411e-08 0.0001796 -4.585 1372569_at Fhl3 up
4 8926 13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at Atp2a2 up
5 19271 11247 119.6 3.466e-08 0.0001796 -4.585 1370085_at Rasa1 up
6 15631 6872 111.4 4.593e-08 0.0002041 -4.585 1383846_at Dok5 up
> table(deg$change)
down stable up
2214 16726 1978
火山图
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id symbol change
1 9942 5460 178.1 7.191e-09 0.0001338 -4.585 1384934_at Slc41a2 up
2 11065 6215 139.7 1.879e-08 0.0001796 -4.585 1389351_at Lrrfip1 up
3 3797 3203 131.1 2.411e-08 0.0001796 -4.585 1372569_at Fhl3 up
4 8926 13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at Atp2a2 up
5 19271 11247 119.6 3.466e-08 0.0001796 -4.585 1370085_at Rasa1 up
6 15631 6872 111.4 4.593e-08 0.0002041 -4.585 1383846_at Dok5 up
> plot(deg$logFC,-log10(deg$P.Value))
image.png
> library(dplyr)
> dat1 <- mutate(deg,v=-log10(P.Value))
> head(dat1)
logFC AveExpr t P.Value adj.P.Val B probe_id symbol change v
1 9942 5460 178.1 7.191e-09 0.0001338 -4.585 1384934_at Slc41a2 up 8.143
2 11065 6215 139.7 1.879e-08 0.0001796 -4.585 1389351_at Lrrfip1 up 7.726
3 3797 3203 131.1 2.411e-08 0.0001796 -4.585 1372569_at Fhl3 up 7.618
4 8926 13513 121.6 3.244e-08 0.0001796 -4.585 1370426_a_at Atp2a2 up 7.489
5 19271 11247 119.6 3.466e-08 0.0001796 -4.585 1370085_at Rasa1 up 7.460
6 15631 6872 111.4 4.593e-08 0.0002041 -4.585 1383846_at Dok5 up 7.338
ggpubr绘图
> library(ggpubr)
> ggscatter(dat1, x = "logFC", y = "v",size=0.5,color = "change")
image.png
> ggscatter(dat1, x = "logFC", y = "v", color = "change",size = 0.5,
+ label = "symbol", repel = T,
+ label.select = dat1$symbol[1:30] ,
+ #label.select = c('CD36','DUSP6'), #挑选一些基因在图中显示出来
+ palette = c("#00AFBB", "#999999", "#FC4E07") )
image.png
绘制热图
挑出高表达基因
> library(pheatmap)
> table(deg$change)
down stable up
2214 16726 1978
> x <- deg$change
> names(x) <- deg$probe_id
> table(x=='up')
FALSE TRUE
18940 1978
> table(x[x=='up'])
up
1978
> cg <- names(x[x=='up'])
归一化和阈值设置
> n=t(scale(t(exp1[cg,])))
> thr=2
> n[n>thr]=thr
> n[n< -thr]= -thr
> n[1:4,1:4]
GSM75272 GSM75273 GSM75274 GSM75275
1384934_at -0.7193 -0.7427 -0.7288 1.103
1389351_at -0.7207 -0.7487 -0.7213 1.085
1372569_at -0.7330 -0.7321 -0.7258 1.090
1370426_a_at -0.7428 -0.7250 -0.7228 1.114
显示分组,绘制热图
> ac=data.frame(group=group_list_1)
> rownames(ac)=colnames(n)
> pheatmap(n,show_colnames =T,show_rownames = F,cluster_col = F,annotation_col = ac,color = colorRampPalette(c("navy", "white", "firebrick3"))(50))
image.png