【年末福利】以单基因生信分析切入,分享ssgsva、GSEA等套路性分析的实现

1. 套路一:观察重要表型与感兴趣基因之间的关系。

比如观察KRAS有无突变两组之间,感兴趣基因的表达是否有差异。

1.1 整理数据:提取感兴趣基因和临床表型数据

expr <- exprs[exprs$symbol %in% c("Your Interesting Gene Symbol"),]
rownames(expr) <- expr$symbol
expr <- expr[,-1]
expr <- as.data.frame(t(expr))
str(expr)
expr$Source.Name <- rownames(expr)
comData <- merge(expr,pheno,by="Source.Name") #合并表达和表型数据

1.2 以表型为分组,观察感兴趣基因之间是否具有差异表达。

ibrary(reshape2)
library(gridExtra)
library(ggthemes)
library(ggplot2)
library(ggpubr)
library(ggsignif)
table(comData$Characteristics.krasmut.)
df <- comData[comData$Characteristics.krasmut. !="not available",]
table(df$Characteristics.krasmut.)

plot <- ggplot(df,aes(x=Characteristics.krasmut., y="Your Gene", fill=Characteristics.krasmut.))+
  geom_boxplot(aes(fill=Characteristics.krasmut.), outlier.shape = NA)+
  geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+ 
  stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
  theme_pubclean() +
  scale_x_discrete(name="")+  
  theme(plot.title = element_text(size = 13, face = "bold"),
        legend.position = "right",
        text = element_text(size = 13),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0)) 
plot

2. 套路二:生存分析

library(survival)
library(survminer)
library(ggthemes)
## remove the "not available" data 
df <- comData[comData$Characteristics.os.delay. !="not available",]
df$os <- as.numeric(df$Characteristics.os.delay.)
df$event <- as.numeric(df$Characteristics.os.event.)

survi_cutoff <- surv_cutpoint(data = df, time = "os", event = "event",
                              variables = "Your gene") 
summary(survi_cutoff)
sur_cut <- surv_categorize(survi_cutoff)
head(sur_cut)
sur_cut$os <- as.numeric(sur_cut$os)
sur_cut$event <- as.numeric(sur_cut$event)
fit <- survfit(Surv("os", "event") ~Your gene, data = sur_cut)
ggsurvplot(fit, data = sur_cat, conf.int = T,  # 95%CI
           size = 0.6,  # change line size
           pval = T,  # p-value of log-rank test
           risk.table = TRUE,  
           xlab=c("Overall survival(months)"), 
           ggtheme = theme_gray(), # theme_minimal() theme_gray() theme_classic() theme_bw()
           legend.title="Your gene", legend.labs=c("High","Low") )

3. 套路三:ssgsva

3.1 make genelist and load expression data

## genelist 
rm(list = ls())
library(GSVA)
library(GSEABase)
library(msigdbr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(enrichplot)
library(limma)
library(readxl)
genelist <- read_excel("genelist.xlsx")
genelist <- as.data.frame(genelist)
pathway_list <- lapply(genelist, function(x) {
  unique(na.omit(x)) 
})

## expression data
load("expr.Rdata")
row.names(expr) <- expr$symbol
exprs <- dplyr::select(expr, -symbol)
gsva <- gsva(as.matrix(exprs), pathway_list,method='ssgsea',
                       kcdf='Gaussian',abs.ranking=TRUE)
write.csv(gsva, file = "gsva.csv")

## setting group according your interesting gene expression
df <- exprs["Your Gene",]
df <- as.data.frame(t(df))
df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
table(df$group)
gsva_matrix <- t(gsva)

## Make sure the samples are in the correct order
df <- df[rownames(gsva),]
gsva_matrix <- cbind(gsva_matrix,df)

3.2 make analysis according the gsva result

library(reshape2)
library(gridExtra)
library(ggthemes)
library(ggplot2)
library(ggpubr)
library(ggsignif)
table(gsva_matrix$group)
rt <- gsva_matrix[,c(1:29,31)]
df <- melt(rt, id.vars = "group", variable.name = "immuneCells",
           value.name = "Expression")
df$group <- factor(df$group, levels=c('High','Low'))
plot <- ggplot(df,aes(x=immuneCells, y=Expression, fill=group))+
  geom_boxplot(aes(fill=group), outlier.shape = NA)+
  geom_jitter(size=0.1, alpha=0.2, position=position_jitter(0.3))+ 
  #geom_signif(comparisons = compaired, map_signif_level = T)+
  stat_compare_means(label = "p.signif", method = "t.test", label.x = 1.5, label.y.npc = "top", hide.ns = TRUE, col="red")+
  theme_pubclean() +
  scale_x_discrete(name="")+  
  theme(plot.title = element_text(size = 13, face = "bold"),
        legend.position = "right",
        text = element_text(size = 13),
        axis.title = element_text(face="bold"),
        axis.text.x=element_text(size = 12, angle = 45, hjust = 1.0, vjust = 1.0)) 
plot

4. 套路四:以单基因高低表达分组,limma差异分析

df <- exprs["Your Gene",]
df <- as.data.frame(t(df))
df$group <- ifelse(df$Your Gene > median(df$Your Gene), "High", "Low")
table(df$group)

library(limma)
design.factor <- factor(df$group, levels=c('Low','High')) 
design.matrix <- model.matrix(~0+design.factor)
colnames(design.matrix) <- levels(design.factor)
design.matrix

fit <- lmFit(exprs, design.matrix)
cont.matrix <- makeContrasts(High-Low, levels=design.matrix)
fit2 <- contrasts.fit(fit, cont.matrix)
fit2 <- eBayes(fit2)
DEG <- topTable(fit2,adjust="fdr",sort.by="B",number=Inf)
head(DEG)
DEG_filter <-subset(DEG, abs(logFC)>=1 & adj.P.Val<0.05)
dim(DEG_filter)

5. 套路五:GO、KEGG差异分析

library(DOSE)
library(GO.db)
library(org.Hs.eg.db)
library(GSEABase)
library(clusterProfiler)

## symbol ID transform to entre id
symbol=as.character(rownames(DEG_filter)) 
eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
id = as.character(eg[,2])
head(id)

## MF
egomf <- enrichGO(gene = id,
                  OrgDb = org.Hs.eg.db,
                  ont = "MF",
                  pAdjustMethod = "BH",
                  pvalueCutoff = 0.05,
                  qvalueCutoff = 0.05,
                  readable = TRUE)
barplot(egomf, showCategory=15)
dotplot(egomf)
emapplot(egomf)
write.csv(egomf@result,"egomf.csv") 

## CC
egocc <- enrichGO(gene = id,
                  OrgDb = org.Hs.eg.db,
                  ont = "CC",
                  pAdjustMethod = "BH",
                  pvalueCutoff = 0.05,
                  qvalueCutoff = 0.05,
                  readable = TRUE)
barplot(egocc, showCategory=15)
dotplot(egocc)
write.csv(egocc@result,"egocc.csv") 

## BP
egobp <- enrichGO(gene = id,
                  OrgDb = org.Hs.eg.db,
                  ont = "BP",
                  pAdjustMethod = "BH",
                  pvalueCutoff = 0.05,
                  qvalueCutoff = 0.05,
                  readable = TRUE)
barplot(egobp, showCategory=15)
dotplot(egobp)
emapplot(egobp)
write.csv(egobp@result,"egobp.csv") 

kk <- enrichKEGG(gene = id,
                 organism = 'hsa',
                 pvalueCutoff = 0.05,
                 pAdjustMethod = "BH",
                 qvalueCutoff = 0.05)
barplot(kk, showCategory=15)
dotplot(kk)
kk <- setReadable(kk, OrgDb = org.Hs.eg.db, keyType="ENTREZID")
write.csv(kk@result,"kegg.csv")
browseKEGG(kk, 'hsa04972')

6. 套路六:GSEA

6.1 对limma差异分析的结果,进行整理

symbol=as.character(rownames(DEG_filter)) 
eg = bitr(symbol, fromType="SYMBOL", toType="ENTREZID", OrgDb="org.Hs.eg.db")
id = as.character(eg[,2])
head(id)
is.data.frame(eg)
gene <- dplyr::distinct(eg, SYMBOL,.keep_all=TRUE)

DEG_filter$SYMBOL <- rownames(DEG_filter)
data_all_sort <- DEG_filter %>% 
  dplyr::inner_join(gene, by="SYMBOL") %>%
  arrange(desc(logFC))
head(data_all_sort)

geneList = data_all_sort$logFC #把foldchange按照从大到小提取出来
names(geneList) <- data_all_sort$ENTREZID #给上面提取的foldchange加上对应上ENTREZID
head(geneList)

ge = DEG_filter$logFC
names(ge) = rownames(DEG_filter)
ge = sort(ge,decreasing = T)
head(ge)

6.2 GSEA

library(msigdbr)
GO_df = msigdbr(species = "Homo sapiens",category = "C5") %>% 
  dplyr::select(gene_symbol,gs_exact_source,gs_subcat)
dim(GO_df)
length(unique(KEGG_df$gs_exact_source)) # 通路数量

# GSEA
library(GSVA)
em <- GSEA(ge, TERM2GENE = GO_df)
gseaplot2(em, geneSetID = 1, title = em$Description[1])

7. 套路七:OncoScore文本挖掘,评分

8. 套路八:与重要基因的相关性分析(如免疫检查点基因)

9. 套路九:与单细胞测序数据结合

年后分享!

10. 套路十:oncomine,TIME,GEPIA等在线网站的使用

前九件套小编主要用于非TCGA数据的分析。

最后编辑于
©著作权归作者所有,转载或内容合作请联系作者
  • 序言:七十年代末,一起剥皮案震惊了整个滨河市,随后出现的几起案子,更是在滨河造成了极大的恐慌,老刑警刘岩,带你破解...
    沈念sama阅读 214,904评论 6 497
  • 序言:滨河连续发生了三起死亡事件,死亡现场离奇诡异,居然都是意外死亡,警方通过查阅死者的电脑和手机,发现死者居然都...
    沈念sama阅读 91,581评论 3 389
  • 文/潘晓璐 我一进店门,熙熙楼的掌柜王于贵愁眉苦脸地迎上来,“玉大人,你说我怎么就摊上这事。” “怎么了?”我有些...
    开封第一讲书人阅读 160,527评论 0 350
  • 文/不坏的土叔 我叫张陵,是天一观的道长。 经常有香客问我,道长,这世上最难降的妖魔是什么? 我笑而不...
    开封第一讲书人阅读 57,463评论 1 288
  • 正文 为了忘掉前任,我火速办了婚礼,结果婚礼上,老公的妹妹穿的比我还像新娘。我一直安慰自己,他们只是感情好,可当我...
    茶点故事阅读 66,546评论 6 386
  • 文/花漫 我一把揭开白布。 她就那样静静地躺着,像睡着了一般。 火红的嫁衣衬着肌肤如雪。 梳的纹丝不乱的头发上,一...
    开封第一讲书人阅读 50,572评论 1 293
  • 那天,我揣着相机与录音,去河边找鬼。 笑死,一个胖子当着我的面吹牛,可吹牛的内容都是我干的。 我是一名探鬼主播,决...
    沈念sama阅读 39,582评论 3 414
  • 文/苍兰香墨 我猛地睁开眼,长吁一口气:“原来是场噩梦啊……” “哼!你这毒妇竟也来了?” 一声冷哼从身侧响起,我...
    开封第一讲书人阅读 38,330评论 0 270
  • 序言:老挝万荣一对情侣失踪,失踪者是张志新(化名)和其女友刘颖,没想到半个月后,有当地人在树林里发现了一具尸体,经...
    沈念sama阅读 44,776评论 1 307
  • 正文 独居荒郊野岭守林人离奇死亡,尸身上长有42处带血的脓包…… 初始之章·张勋 以下内容为张勋视角 年9月15日...
    茶点故事阅读 37,087评论 2 330
  • 正文 我和宋清朗相恋三年,在试婚纱的时候发现自己被绿了。 大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
    茶点故事阅读 39,257评论 1 344
  • 序言:一个原本活蹦乱跳的男人离奇死亡,死状恐怖,灵堂内的尸体忽然破棺而出,到底是诈尸还是另有隐情,我是刑警宁泽,带...
    沈念sama阅读 34,923评论 5 338
  • 正文 年R本政府宣布,位于F岛的核电站,受9级特大地震影响,放射性物质发生泄漏。R本人自食恶果不足惜,却给世界环境...
    茶点故事阅读 40,571评论 3 322
  • 文/蒙蒙 一、第九天 我趴在偏房一处隐蔽的房顶上张望。 院中可真热闹,春花似锦、人声如沸。这庄子的主人今日做“春日...
    开封第一讲书人阅读 31,192评论 0 21
  • 文/苍兰香墨 我抬头看了看天上的太阳。三九已至,却和暖如春,着一层夹袄步出监牢的瞬间,已是汗流浃背。 一阵脚步声响...
    开封第一讲书人阅读 32,436评论 1 268
  • 我被黑心中介骗来泰国打工, 没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留,地道东北人。 一个月前我还...
    沈念sama阅读 47,145评论 2 366
  • 正文 我出身青楼,却偏偏与公主长得像,于是被迫代替她去往敌国和亲。 传闻我的和亲对象是个残疾皇子,可洞房花烛夜当晚...
    茶点故事阅读 44,127评论 2 352