机器学习部分
#Description
#The core R codes for statistical analyses of manuscript gut jnl-2019-318860
#If you have any question of our source code, please contact me at yuchen_li@foxmail.com
#>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
#The tsne method for Dimension reduction and visualization
library(Rtsne)
tsne <- Rtsne(matrix, dims=3, pca = F, initial_dims = 10, perplexity = 100, max_iter = 500)
#the lasso and random forest algorithm for exLR-seq marker selection
library(glmnet)
lasso.results <- c()
for (j in 1:1000) {
n1 <- n[sample(1:nrow(n),ceiling(dim(n)[1]*0.5),replace=F),]
h1 <- h[sample(1:nrow(h),ceiling(dim(h)[1]*0.5),replace=F),]
s <- rbind.data.frame(n1,h1)
ts <- apply(s,2,as.numeric)
y <- as.matrix(ts[,1])
x <- as.matrix(ts[,2:dim(ts)[1]])
cv.fit <- cv.glmnet(x,y,family="binomial", type.measure = "auc", nfolds=5)
co<-coef.cv.glmnet(cv.fit,s="lambda.1se")
name <- rownames(co)[co[,1]!=0]
lasso.results <- c(lasso.results, name)
}
freq.lasso.results <- table(lasso.results)
library(varSelRF)
facy <- factor(training.data$status)
x <- training.data[,-1]
step=varSelRF(x, facy, c.sd = 1, mtryFactor = 1, ntree = 500,
ntreeIterat = 500, vars.drop.num = NULL, vars.drop.frac = 0.1, #
whole.range = TRUE, recompute.var.imp = FALSE, verbose = FALSE,
returnFirstForest = TRUE, fitted.rf = NULL, keep.forest = FALSE)
select.history <- step$selec.history
#The training diganostic model construction and SVM algorithm operation
set.seed(1)
library(pROC)
library(caret)
library(e1071)
library(kernlab)
fitControl <- trainControl(method = "repeatedcv",
number = 5,
repeats = 10,
## Estimate class probabilities
classProbs = TRUE,
## Evaluate performance using
## the following function
summaryFunction = twoClassSummary)
svmFit <- train(status ~ ., data = training.data,
method = "svmRadial",
trControl = fitControl,
preProc = c("center", "scale"),
tuneLength = 8,
metric = "ROC")
#The establishment of d-signature from three cohort
train.pred <- predict(svmFit,training.data)
validation.pred <- predict(svmFit,validation.data)
testing.pred <- predict(svmFit,testing.data)
#The caculation of diagnostic efficacy of d-signature
train.con <- confusionMatrix(train.pred, training.data$status)
validation.con <- confusionMatrix(validation.pred, validation.data$status)
testing.con <- confusionMatrix(testing.pred, testing.data$status)
tr.acc <- train.con$overall[1]
tr.se <- train.con$byClass[1]
tr.sp <- train.con$byClass[2]
va.acc <- validation.con$overall[1]
va.se <- validation.con$byClass[1]
va.sp <- validation.con$byClass[2]
te.acc <- testing.con$overall[1]
te.se <- testing.con$byClass[1]
te.sp <- testing.con$byClass[2]
#The Normfinder method for RT-qPCR
source("r.NormOldStab5.txt")
Result=Normfinder("Datafile.txt",ctVal=FALSE)
selected.exLRs.normfinder.results <- Result$Ordered
性别和年龄调整部分
####### Differential gene expression analysis using a linear mixed-effects model
load("./datExpr.rda") # Load expression data (normalized for library size and technical variables, log2 transformed)
load("./datTraits.rda") # Load metadata
library(nlme)
# Fixed effects:
Diagnosis = as.factor(datTraits$Diagnosis)
Diagnosis = relevel(Diagnosis,ref = "CTL") # Using control as reference level
Age = as.numeric(datTraits$Age)
Sex = as.factor(datTraits$Sex)
Region = as.factor(datTraits$Region)
# Random effect:
BrainID = as.factor(datTraits$Brain_ID)
DE_Diagnosis = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Age = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Sex = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
DE_Region = data.frame(miRID = NA,Beta = NA,StdErr = NA,Pval = NA)
# Define a function that runs the linear mixed-effects model and extract Beta values, standard errors, and P values for the fixed effects
runlme = function(miR) {
lme = lme(miR~Diagnosis + Age + Sex + Region, rand = ~1|BrainID,na.action = na.exclude)
slme = summary(lme);
Beta_Diagnosis = slme$tTable[2,1]
Beta_Age = slme$tTable[3,1]
Beta_Sex = slme$tTable[4,1]
Beta_Region = slme$tTable[5,1]
StdErr_Diagnosis = slme$tTable[2,2]
StdErr_Age = slme$tTable[3,2]
StdErr_Sex = slme$tTable[4,2]
StdErr_Region = slme$tTable[5,2]
Pval_Diagnosis = slme$tTable[2,5]
Pval_Age = slme$tTable[3,5]
Pval_Sex = slme$tTable[4,5]
Pval_Region = slme$tTable[5,5]
LME_Diagnosis = list(Beta = Beta_Diagnosis,StdErr = StdErr_Diagnosis,Pval = Pval_Diagnosis)
LME_Age = list(Beta = Beta_Age,StdErr = StdErr_Age,Pval = Pval_Age)
LME_Sex = list(Beta = Beta_Sex,StdErr = StdErr_Sex,Pval = Pval_Sex)
LME_Region = list(Beta = Beta_Region,StdErr = StdErr_Region,Pval = Pval_Region)
return(list(LME_Diagnosis = LME_Diagnosis,LME_Age = LME_Age,LME_Sex = LME_Sex,LME_Region = LME_Region));
}
# Run the linear mixed-effects model for all miRNAs in the data set
for (i in 1:nrow(datExpr)){
miR = as.numeric(datExpr[i,])
result = try(runlme(miR),silent = F)
if (length(result) == 4) {
Beta_Diagnosis = result[[1]]$Beta
StdErr_Diagnosis = result[[1]]$StdErr
Pval_Diagnosis = result[[1]]$Pval
Beta_Age = result[[2]]$Beta
StdErr_Age = result[[2]]$StdErr
Pval_Age = result[[2]]$Pval
Beta_Sex = result[[3]]$Beta
StdErr_Sex = result[[3]]$StdErr
Pval_Sex = result[[3]]$Pval
Beta_Region = result[[4]]$Beta
StdErr_Region = result[[4]]$StdErr
Pval_Region = result[[4]]$Pval
} else {
cat('Error in LME for microRNA',rownames(datExpr)[i],'\n')
cat(' Setting Beta value=0, SE=Inf, and P-value=1\n')
Beta_Diagnosis = Beta_Age = Beta_Sex = Beta_Region = 0
StdErr_Diagnosis = StdErr_Age = StdErr_Sex = StdErr_Region = Inf
Pval_Diagnosis = Pval_Age = Pval_Sex = Pval_Region = 1
}
DE_Diagnosis[i,"miRID"] = rownames(datExpr)[i]; DE_Diagnosis[i,"Beta"] = Beta_Diagnosis; DE_Diagnosis[i,"StdErr"] = StdErr_Diagnosis; DE_Diagnosis[i,"Pval"] = Pval_Diagnosis
DE_Age[i,"miRID"] = rownames(datExpr)[i]; DE_Age[i,"Beta"] = Beta_Age; DE_Age[i,"StdErr"] = StdErr_Age; DE_Age[i,"Pval"] = Pval_Age
DE_Sex[i,"miRID"] = rownames(datExpr)[i]; DE_Sex[i,"Beta"] = Beta_Sex; DE_Sex[i,"StdErr"] = StdErr_Sex; DE_Sex[i,"Pval"] = Pval_Sex
DE_Region[i,"miRID"] = rownames(datExpr)[i]; DE_Region[i,"Beta"] = Beta_Region; DE_Region[i,"StdErr"] = StdErr_Region; DE_Region[i,"Pval"] = Pval_Region
}
# Perform FDR adjustment using Benjamini-Hochberg correction
DE_Diagnosis$adjPval = p.adjust(DE_Diagnosis$Pval,method = "BH",n = nrow(DE_Diagnosis))
DE_Age$adjPval = p.adjust(DE_Age$Pval,method = "BH",n = nrow(DE_Age))
DE_Sex$adjPval = p.adjust(DE_Sex$Pval,method = "BH",n = nrow(DE_Sex))
DE_Region$adjPval = p.adjust(DE_Region$Pval,method = "BH",n = nrow(DE_Region))
write.csv(DE_Diagnosis,file = "./DGE_Diagnosis.csv")
write.csv(DE_Age,file = "./DGE_Age.csv")
write.csv(DE_Sex,file = "./DGE_Sex.csv")
write.csv(DE_Region,file = "./DGE_BrainRegion.csv")
WGCNA部分(未使用)
####### WGCNA analysis using bootstrapping
library(WGCNA)
library(flashClust)
options(stringsAsFactors = FALSE)
dir.create("./Network_resampled")
dir.create("./TOM_resampled")
### 1. Construct network using the original sample set
load("./datExpr.rda") # Load expression data (normalized for library size and technical variables, log2 transformed)
load("./datTraits.rda") # Load metadata
# Pick soft threshold
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))
sft = pickSoftThreshold(datExpr,networkType = "signed",corFnc = "bicor",verbose = 5,powerVector = powers)
softPower = 8 # Choose based on fit to scale-free topology
adjacency = adjacency(datExpr, corFnc = "bicor", type = "signed", power = softPower)
TOM = TOMsimilarity(adjacency,TOMType = "signed", verbose = 0)
dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method = "average")
minModuleSize = 10
ds = 3
cutHeight = 0.99999
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method = "hybrid",
deepSplit = ds, pamRespectsDendro = T,pamStage = T,
minClusterSize = minModuleSize, cutHeight = cutHeight)
dynamicColors = labels2colors(dynamicMods)
# Calculate eigengenes
MEList = moduleEigengenes(datExpr, colors = dynamicColors,softPower = softPower)
MEs = MEList$eigengenes
# Merge modules whose module eigengenes are highly correlated
MEDissThres = 0.15
merge = mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
moduleColors = merge$colors
MEs = merge$newMEs
save(MEs, TOM, moduleColors, geneTree, file = "./Network.RData")
### 2. Perform bootstrapping
datTraits_ASD = datTraits[datTraits$Diagnosis == "ASD",]
datTraits_Ctrl = datTraits[datTraits$Diagnosis == "CTL",]
datExpr_ASD = datExpr[match(rownames(datTraits_ASD),rownames(datExpr)),]
datExpr_Ctrl = datExpr[match(rownames(datTraits_Ctrl),rownames(datExpr)),]
softPower = 8
ds = 3
cutHeight = 0.99999
minModuleSize = 10
nSets = 200
for (i in 1:nSets){
if (i%%10 == 0) { print(paste(i,"th resampling",sep = "")) }
# Resample within ASD and CTL groups
ASD_resample = sample(1:nrow(datExpr_ASD),size = nrow(datExpr_ASD),replace = T)
datExpr_ASD_resample = datExpr_ASD[ASD_resample,]
Ctrl_resample = sample(1:nrow(datExpr_Ctrl),size = nrow(datExpr_Ctrl),replace = T)
datExpr_Ctrl_resample = datExpr_Ctrl[Ctrl_resample,]
datExpr_resample = rbind(datExpr_ASD_resample,datExpr_Ctrl_resample)
adjacency = adjacency(datExpr_resample, corFnc = "bicor", type = "signed", power = softPower)
TOM = TOMsimilarity(adjacency,TOMType = "signed")
dissTOM = 1-TOM
geneTree = flashClust(as.dist(dissTOM), method = "average")
dynamicMods = cutreeDynamic(dendro = geneTree, distM = dissTOM, method="hybrid",
deepSplit = ds, pamRespectsDendro = T, pamStage=T,
minClusterSize = minModuleSize, cutHeight = cutHeight, verbose=0)
dynamicColors = labels2colors(dynamicMods)
MEList = moduleEigengenes(datExpr_resample, colors = dynamicColors,softPower = softPower)
MEs = MEList$eigengenes
# Merge modules whose module eigengenes are highly correlated
MEDissThres = 0.15
merge = mergeCloseModules(datExpr_resample, dynamicColors, cutHeight = MEDissThres, verbose = 0)
moduleColors = merge$colors
MEs = merge$newMEs
save(MEs, TOM, moduleColors, geneTree, file = paste("./Network_resampled/Network_resampled",i,".RData",sep = ""))
}
### 3. Calculate consensus TO matrix
nSets = 200
prob = 0.95
nSubset = 20000
load("./Network.RData")
TOM.mat = as.matrix(TOM)
nGenes = nrow(TOM.mat)
subsetEntries = sample(nGenes*(nGenes-1)/2, size = nSubset)
TOMsubset = list()
TOMsubset.main = vectorizeMatrix(TOM.mat)[subsetEntries]
quantile.TOM.main = quantile(TOMsubset.main,probs = prob,type = 8)
quantile.TOM = rep(1, nSets) ## vector of quantiles of the individual TO matrices
beta.prob = rep(1, nSets) ## Scaling powers to calibrate TOM values
for (set in 1:nSets) {
if (!file.exists(paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))) {
print(paste("On resampled TOM ",set,"...",sep = ""))
load(paste("./Network_resampled/Network_resampled",set,".RData",sep = ""))
tmpTOM = as.matrix(TOM)
TOMsubset[[set]] = vectorizeMatrix(tmpTOM)[subsetEntries]
quantile.TOM[set] = quantile(TOMsubset[[set]],probs = prob,type = 8)
beta.prob[set] = log(quantile.TOM.main)/log(quantile.TOM[set]) ## calculate the power of the adjacency function
print(paste("Scaling power for this TOM is ",signif(beta.prob[set],3),sep = ""))
TOM = tmpTOM^(beta.prob[set]) ## use the power adjacency function for calibrating the TOM
save(TOM,file = paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))
} else {
print(paste("./Network_resampled/Network_resampled",set,".RData",sep = ""))
print("Already processed!")
}
}
rm(TOM.mat,tmpTOM)
consTOM = vector("list",nSets)
for (set in 1:nSets) {
print(paste("loading TOM",set,sep = " "))
load(paste("./TOM_resampled/TOM_resampled",set,".RData",sep = ""))
consTOM[[set]]$TOM = TOM
}
print("Calculting the median quantile across the TOMs")
consensusTOM = matrix(NA,nrow = nrow(TOM),ncol = ncol(TOM))
for (i in 1:nrow(TOM) ) {
if (i%%100 == 0) { print(paste("On gene",i,sep = " ")) }
for (j in 1:ncol(TOM)) {
thisvec = vector(mode = "numeric",length = nSets)
for (k in 1:nSets){
thisvec[k]=consTOM[[k]]$TOM[i,j]
}
consensusTOM[i,j] = median(thisvec)
}
}
save(consensusTOM,file = "./TOM_resampled/ConsensusTOM.RData")
# Define modules based on consensus TO matrix
cons_dissTOM = 1-consensusTOM
cons_geneTree = flashClust(as.dist(cons_dissTOM), method = "average")
softPower = 8
ds = 3
cutHeight = 0.99999
minModuleSize = 10
cons_dynamicMods = cutreeDynamic(dendro = cons_geneTree, distM = cons_dissTOM, method = "hybrid",
deepSplit = ds, pamRespectsDendro = T, pamStage = T,
minClusterSize = minModuleSize, cutHeight = cutHeight, verbose = 0)
cons_dynamicColors = labels2colors(cons_dynamicMods)
MEDissThres = 0.15
cons_merge = mergeCloseModules(datExpr, cons_dynamicColors, cutHeight = MEDissThres, verbose = 0)
cons_moduleColors = cons_merge$colors
cons_moduleColors = matchLabels(cons_moduleColors, moduleColors, pThreshold = 5e-2) # Match consensus module colors with original module colors
cons_MEList = moduleEigengenes(datExpr, colors = cons_moduleColors, softPower = softPower)
cons_MEs = cons_MEList$eigengenes
save(cons_MEs, consensusTOM, cons_moduleColors, cons_geneTree, file = "ConsensusNetwork.RData")
# Calculate module membership
modNames=substring(names(cons_MEs),3)
geneModuleBicor=bicorAndPvalue(datExpr, cons_MEs, use = "p")
geneModuleMembership = as.data.frame(geneModuleBicor$bicor)
MMPvalue = as.data.frame(geneModuleBicor$p)
names(geneModuleMembership) = paste("MM", modNames, sep = "")
names(MMPvalue) = paste("p.MM", modNames, sep = "")
geneInfo = data.frame(miRID = colnames(datExpr), moduleColor = cons_moduleColors, geneModuleMembership, MMPvalue)
write.csv(geneInfo, file = "./ConsensusNetwork_ModuleMembership.csv")
参考文献:
- Plasma extracellular vesicle long RNA profiling identifies a diagnostic signature for the detection of pancreatic ductal adenocarcinoma.
- Extracellular Vesicles Long RNA Sequencing Reveals Abundant mRNA, circRNA, and lncRNA in Human Blood as Potential Biomarkers for Cancer Diagnosis.
- Genome-wide, integrative analysis implicates microRNA dysregulation in autism spectrum disorder