在运行 R 代码之前,需要先到 CellMiner 网站下载药物数据和表达数据:
「https://discover.nci.nih.gov/cellminer/home.do」
点击 Download Data Sets ——> Processed Data Set,下载后续代码要读取的两个文件:药物活性数据 DTP_NCI60_ZSCORE.xlsx 和表达数据 RNA__RNA_seq_composite_expression.xls。
接下来交给R:
# Session setup: point R at the folder holding the downloaded CellMiner files
# and make sure readxl is available for reading the Excel workbooks.
# NOTE(review): this working directory is machine-specific; adjust it (or run
# the script from the data folder) before using it on another computer.
setwd("/Users/jiangchengyang/Desktop/药敏实验")
getwd()
# Install readxl only when it is missing, so that re-running the script does
# not re-download the package every time.
if (!requireNamespace("readxl", quietly = TRUE)) {
  install.packages("readxl")
}
library(readxl)
# Build the drug activity table from the CellMiner z-score workbook and write
# it to output/drug.txt as a tab-separated file.
dat1 <- read_excel(path = "DTP_NCI60_ZSCORE.xlsx", skip = 7)
# After skipping the preamble, the first data row holds the real column
# headers: promote it to column names, then drop it together with
# columns 67 and 68.
colnames(dat1) <- dat1[1, ]
dat1 <- dat1[-1, -c(67, 68)]
# Drug selection criteria: inspect how many compounds fall in each FDA status.
table(dat1$`FDA status`)
# Keep only compounds that are FDA approved or in clinical trials.
dat1 <- dat1[dat1$`FDA status` %in% c("FDA approved", "Clinical trial"), ]
# Drop columns 1 and 3-6; the first remaining column is later used as the
# drug identifier when the table is read back.
dat1 <- dat1[, -c(1, 3:6)]
# Create the output directory on first use.
if (!dir.exists("output")) {
  dir.create("output")
}
write.table(
  dat1,
  file = "output/drug.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
### ============== Load expression data
# Build the expression table from the CellMiner RNA-seq workbook and write it
# to output/geneExp.txt as a tab-separated file.
dat2 <- read_excel(path = "RNA__RNA_seq_composite_expression.xls", skip = 9)
# The first data row holds the real column headers: promote it to column
# names, then drop it together with columns 2-6.
colnames(dat2) <- dat2[1, ]
dat2 <- dat2[-1, -c(2:6)]
# The table is read back later from "output/geneExp.txt", so it must be
# written into the output directory (not the working directory).
if (!dir.exists("output")) {
  dir.create("output")
}
write.table(
  dat2,
  file = "output/geneExp.txt",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)
# Install and load the Bioconductor packages used for imputation (impute) and
# for averaging replicate rows (limma).
# BiocManager must be available before any BiocManager::install() call.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Install each package only when it is missing to avoid reinstalling per run.
if (!requireNamespace("impute", quietly = TRUE)) {
  BiocManager::install("impute")
}
if (!requireNamespace("limma", quietly = TRUE)) {
  BiocManager::install("limma")
}
library(impute)
library(limma)
# Read the drug input file and turn it into a numeric matrix whose row names
# come from the first column of the table.
drugDat <- read.table(
  "output/drug.txt",
  sep = "\t",
  header = TRUE,
  check.names = FALSE,
  quote = ""
)
drugDat <- as.matrix(drugDat)
rownames(drugDat) <- drugDat[, 1]
# drop = FALSE keeps a matrix even if only one value column remains.
drug <- drugDat[, -1, drop = FALSE]
# Coerce every entry to numeric; entries that are not numbers become NA.
data <- matrix(
  as.numeric(drug),
  nrow = nrow(drug),
  dimnames = list(rownames(drug), colnames(drug))
)
# The drug sensitivity data contain NA values. Keep only the columns in which
# fewer than 80% of the entries are missing, then fill the remaining gaps with
# impute.knn(), which estimates missing values by nearest-neighbour averaging.
f <- function(x) {
  sum(is.na(x)) / length(x) * 100 < 80
}
data <- data[, apply(data, 2, f), drop = FALSE]
mat <- impute.knn(data)
drug <- mat$data
# Load dplyr and limma, then collapse rows of the drug matrix that share the
# same row name.
# Install dplyr only when it is missing to avoid reinstalling on every run.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
library(limma)
# limma::avereps() averages rows that have identical row names, leaving one
# row per drug.
drug <- avereps(drug)
# Quick look at the first 12 column names of the drug matrix.
colnames(drug)[1:12]
# Read the expression input file; the first column supplies the row names.
# quote = "" matches how the drug table is read: the file is written without
# quoting, so quote characters inside names must not be interpreted.
exp <- read.table(
  "output/geneExp.txt",
  sep = "\t",
  header = TRUE,
  row.names = 1,
  check.names = FALSE,
  quote = ""
)
dim(exp)
# Extract the expression of specific genes: install and load the packages
# attached for this step.
# BiocManager must be available before any BiocManager::install() call.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Install the Bioconductor packages first so that they are already present
# when WGCNA is installed from CRAN (WGCNA is expected to depend on them).
if (!requireNamespace("GO.db", quietly = TRUE)) {
  BiocManager::install("GO.db")
}
if (!requireNamespace("preprocessCore", quietly = TRUE)) {
  BiocManager::install("preprocessCore")
}
if (!requireNamespace("WGCNA", quietly = TRUE)) {
  install.packages("WGCNA")
}
library(WGCNA)
if (!requireNamespace("tidyr", quietly = TRUE)) {
  install.packages("tidyr")
}
library(tidyr)
# Correlate the expression of the selected genes with every row of the drug
# matrix (Pearson), label the significant pairs, and save the results together
# with the matrices they were computed from.

# Genes of interest; more than one gene may be listed.
inputgene <- c("MSI1")
gl <- intersect(inputgene, row.names(exp))
if (length(gl) == 0) {
  stop(
    "None of the requested genes were found in the expression table: ",
    paste(inputgene, collapse = ", "),
    call. = FALSE
  )
}
missing_cols <- setdiff(colnames(drug), colnames(exp))
if (length(missing_cols) > 0) {
  stop(
    "Columns of the drug matrix missing from the expression table: ",
    paste(missing_cols, collapse = ", "),
    call. = FALSE
  )
}
# Keep only the requested genes, with columns in the same order as `drug` so
# that the two profiles line up element by element.
exp <- exp[gl, , drop = FALSE]
exp <- exp[, colnames(drug), drop = FALSE]

# One result row per gene/drug pair, collected in a preallocated list instead
# of growing a data frame inside the loop.
pair_results <- vector("list", nrow(exp) * nrow(drug))
pair_idx <- 0L
for (gene in row.names(exp)) {
  x <- as.numeric(exp[gene, ])
  for (Drug in row.names(drug)) {
    y <- as.numeric(drug[Drug, ])
    res <- cor.test(x, y, method = "pearson")
    pair_idx <- pair_idx + 1L
    pair_results[[pair_idx]] <- data.frame(
      gene = gene,
      Drug = Drug,
      cor = unname(res$estimate),
      p = res$p.value,
      stringsAsFactors = FALSE
    )
  }
}
output <- do.call(rbind, pair_results)

# A pair is significant when p < 0.05 and |cor| > 0.3; the sign of the
# correlation decides the label.
# NOTE(review): the positive label is " Pos" with a leading space. It is kept
# unchanged because downstream code may match on it; confirm whether "Pos"
# was intended.
is_sig <- output$p < 0.05 & abs(output$cor) > 0.3
output$sig <- ifelse(is_sig, ifelse(output$cor > 0.3, " Pos", "Neg"), "No")
save(output, drug, exp, file = "output.RData")
输出的是一个 RData 文件(output.RData),其中保存了 output、drug、exp 三个对象,便于后续进行可视化。
PS:以上代码在最新版本的 R 和 RStudio 下运行;旧版本经常出现 R 版本不适用的报错。部分代码是冗余的,是为了解决这些报错而保留的。