> rm(list = ls())
> load(file = "step2output.Rdata")
> #查看右侧变量
>
> #差异分析,用limma包来做
> #需要表达矩阵和Group,不需要改
> library(limma)
Warning message:
程辑包‘limma’是用R版本4.0.3 来建造的
> design=model.matrix(~Group)
> fit=lmFit(exp,design)
> fit=eBayes(fit)#贝叶斯检验
> deg=topTable(fit,coef=2,number = Inf)
> dim(deg)
[1] 54675 6
> head(deg)
logFC AveExpr t P.Value adj.P.Val B
224563_at -3.419608 10.462003 -16.58978 4.947714e-14 2.358899e-09 21.39728
208629_s_at -1.633518 11.204696 -16.14895 8.628803e-14 2.358899e-09 20.92048
212417_at -1.810946 8.760968 -15.16339 3.139295e-13 4.882577e-09 19.79803
201861_s_at -1.713884 11.965832 -15.06767 3.572073e-13 4.882577e-09 19.68466
208325_s_at -1.840898 9.321510 -13.60230 2.819698e-12 3.083340e-08 17.84515
203479_s_at -2.274611 7.651265 -13.43343 3.618635e-12 3.297481e-08 17.61994
> identical(rownames(deg),rownames(exp))#已按照p值列排序 不同了
[1] FALSE
> head(rownames(deg))
[1] "224563_at" "208629_s_at" "212417_at" "201861_s_at" "208325_s_at" "203479_s_at"
> head(rownames(exp))
[1] "1007_s_at" "1053_at" "117_at" "121_at" "1255_g_at" "1294_at"
>
> #为deg数据框添加几列
> #1.加probe_id列,把行名变成一列
> library(dplyr)
载入程辑包:‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
Warning message:
程辑包‘dplyr’是用R版本4.0.3 来建造的
> deg <- mutate(deg,probe_id=rownames(deg))
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id
1 -3.419608 10.462003 -16.58978 4.947714e-14 2.358899e-09 21.39728 224563_at
2 -1.633518 11.204696 -16.14895 8.628803e-14 2.358899e-09 20.92048 208629_s_at
3 -1.810946 8.760968 -15.16339 3.139295e-13 4.882577e-09 19.79803 212417_at
4 -1.713884 11.965832 -15.06767 3.572073e-13 4.882577e-09 19.68466 201861_s_at
5 -1.840898 9.321510 -13.60230 2.819698e-12 3.083340e-08 17.84515 208325_s_at
6 -2.274611 7.651265 -13.43343 3.618635e-12 3.297481e-08 17.61994 203479_s_at
> #2.加上探针注释
> #查看
> dim(ids)
[1] 41905 2
> head(ids)
probe_id symbol
1 1053_at RFC2
2 117_at HSPA6
3 121_at PAX8
4 1255_g_at GUCA1A
5 1316_at THRA
6 1320_at PTPN21
> table(!duplicated(ids$probe_id))#无重复的 第一列probe_id
TRUE
41905
> table(!duplicated(ids$symbol))#无重复的 第二列 symbol
FALSE TRUE
21744 20161
去重
按symbol列去重,常见标准有3个:最大值/平均值/随机去重
方法1.随机去重
> ids = ids[!duplicated(ids$symbol),]#谁排第一个就取谁
>
> deg <- inner_join(deg,ids,by="probe_id")#去交集
> head(deg)
logFC AveExpr t P.Value adj.P.Val B probe_id symbol
1 -1.633518 11.204696 -16.14895 8.628803e-14 2.358899e-09 20.92048 208629_s_at HADHA
2 -1.713884 11.965832 -15.06767 3.572073e-13 4.882577e-09 19.68466 201861_s_at LRRFIP1
3 -2.274611 7.651265 -13.43343 3.618635e-12 3.297481e-08 17.61994 203479_s_at OTUD4
4 -1.935657 8.628444 -13.04331 6.501205e-12 3.554534e-08 17.08858 203391_at FKBP2
5 -1.893808 8.896532 -12.38173 1.812689e-11 6.607251e-08 16.15073 205457_at ILRUN
6 -1.691681 10.302132 -12.00890 3.290947e-11 1.124578e-07 15.60094 201392_s_at IGF2R
> nrow(deg)
[1] 20161
方法2.求平均值
> #rm(list = ls())
> #load("step2output.Rdata")
> exp3 = exp[ids$probe_id,]
> exp3[1:4,1:4]
GSM1366348 GSM1366349 GSM1366350 GSM1366351
1053_at 8.932805 8.679543 8.625015 8.637085
117_at 9.383421 8.605809 9.462774 9.898573
121_at 7.916751 8.500635 8.258467 8.553656
1255_g_at 5.085221 2.414033 1.718570 4.311794
> # 有重复的基因,求平均值
> s = unique(ids$symbol[duplicated(ids$symbol)]);length(s)
[1] 10753
> exp4 = sapply(s, function(i){
+ colMeans(exp3[ids$symbol==i,])
+ })
> dim(exp4)
[1] 22 10753
> exp4 = t(exp4)
> dim(exp4)
[1] 10753 22