-
oligo
包官方介绍:https://bioconductor.org/packages/release/bioc/html/oligo.html -
oligo
包简单教程:https://kasperdanielhansen.github.io/genbioconductor/html/oligo.html -
示例数据:GSE159676
step1:下载Array芯片原始数据(CEL.gz))
-
https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE159676
以LTH与NASH组样本为例:
list.files("GSE159676_RAW/")
# [1] "GSM4837490_2010-03-26_LTH_1_HuGene-1_0-st-v1_.CEL.gz"
# [2] "GSM4837491_2010-03-26_LTH_2_HuGene-1_0-st-v1_.CEL.gz"
# [3] "GSM4837492_2010-04-01_LTH_3_HuGene-1_0-st-v1_.CEL.gz"
# [4] "GSM4837493_2010-04-01_LTH_4_HuGene-1_0-st-v1_.CEL.gz"
# [5] "GSM4837494_2010-04-01_LTH_5_HuGene-1_0-st-v1_.CEL.gz"
# [6] "GSM4837495_2010-04-01_LTH_6_HuGene-1_0-st-v1_.CEL.gz"
# [7] "GSM4837496_2010-03-26_NASH_1_HuGene-1_0-st-v1_.CEL.gz"
# [8] "GSM4837497_2010-04-01_NASH_2_HuGene-1_0-st-v1_.CEL.gz"
# [9] "GSM4837498_2010-04-01_NASH_3_HuGene-1_0-st-v1_.CEL.gz"
# [10] "GSM4837499_2010-04-01_NASH_4_HuGene-1_0-st-v1_.CEL.gz"
# [11] "GSM4837500_2010-04-01_NASH_5_HuGene-1_0-st-v1_.CEL.gz"
# [12] "GSM4837501_2010-04-15_NASH_6_HuGene-1_0-st-v1_.CEL.gz"
# [13] "GSM4837502_2010-04-15_NASH_7_HuGene-1_0-st-v1_.CEL.gz"
step2:加载R包,初步读取表达数据
library(oligo)
celfiles <- list.files("GSE159676_RAW", full = TRUE)
rawData <- read.celfiles(celfiles)
exprs(rawData)[1:4,1:3]
# GSM4837490_2010-03-26_LTH_1_HuGene-1_0-st-v1_.CEL.gz
# 1 2530
# 2 40
# 3 2435
# 4 55
# GSM4837491_2010-03-26_LTH_2_HuGene-1_0-st-v1_.CEL.gz
# 1 2576
# 2 42
# 3 2612
# 4 31
# GSM4837492_2010-04-01_LTH_3_HuGene-1_0-st-v1_.CEL.gz
# 1 2302
# 2 44
# 3 2333
# 4 48
summary(exprs(rawData)[, 1:3])
# GSM4837490_2010-03-26_LTH_1_HuGene-1_0-st-v1_.CEL.gz
# Min. : 20.0
# 1st Qu.: 35.0
# Median : 67.0
# Mean : 254.9
# 3rd Qu.: 181.0
# Max. :18087.0
# GSM4837491_2010-03-26_LTH_2_HuGene-1_0-st-v1_.CEL.gz
# Min. : 19.0
# 1st Qu.: 32.0
# Median : 52.0
# Mean : 175.4
# 3rd Qu.: 119.0
# Max. :13711.0
# GSM4837492_2010-04-01_LTH_3_HuGene-1_0-st-v1_.CEL.gz
# Min. : 20.0
# 1st Qu.: 35.0
# Median : 64.0
# Mean : 243.1
# 3rd Qu.: 167.0
# Max. :14906.0
step3:修改组别名信息
filename = sampleNames(rawData)
filename_split = stringr::str_split(filename, "_", simplify = TRUE)
sampleNames(rawData) = filename_split[,1]
pData(rawData)$GSM = filename_split[,1]
pData(rawData)$Group = filename_split[,3]
head(pData(rawData))
# index GSM Group
# GSM4837490 1 GSM4837490 LTH
# GSM4837491 2 GSM4837491 LTH
# GSM4837492 3 GSM4837492 LTH
# GSM4837493 4 GSM4837493 LTH
# GSM4837494 5 GSM4837494 LTH
# GSM4837495 6 GSM4837495 LTH
exprs(rawData)[1:4,1:3]
# GSM4837490 GSM4837491 GSM4837492
# 1 2530 2576 2302
# 2 40 42 44
# 3 2435 2612 2333
# 4 55 31 48
step4:表达数据标准化
normData <- rma(rawData)
# Background correcting
# Normalizing
# Calculating Expression
exprs(normData)[1:4,1:3]
# GSM4837490 GSM4837491 GSM4837492
# 7892501 2.829734 1.202127 2.415802
# 7892502 3.680496 3.178658 3.480709
# 7892503 2.693415 2.391527 2.351842
# 7892504 7.416071 7.063614 7.214720
class(normData)
# [1] "ExpressionSet"
# attr(,"package")
# [1] "Biobase"