2024-09-27通过拟合线性回归模型lm()实现代谢组数据和转录组数据的简单关联

1. 代谢组数据示例(test_meta.txt)

SampleID mws1401

A13 0.00

A16 0.00

A7 0.00

A17 0.00

A11 0.00

A3 0.00

A12 0.00

A20 0.00

A19 0.00

A15 5.98

A14 5.32

A9 0.00

A8 6.01

A5 6.04

2.基因表达谱数据示例(test_gene.txt)

GeneID A13 A16 A7 A17 A11 A3 A12 A20 A19 A15 A14 A9 A8 A5

Gene1 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.98 5.32 0.00 6.01 6.04

Gene2 5.59 5.71 5.74 5.49 5.62 5.79 5.70 6.02 5.94 5.52 5.72 5.43 5.87 5.95

Gene3 5.26 5.96 6.49 5.58 5.97 6.58 6.10 6.72 6.51 6.52 6.37 5.87 6.50 6.92

Gene4 5.62 5.51 4.36 5.91 5.27 5.89 4.94 6.62 5.38 5.95 5.82 5.44 5.91 6.64

Gene5 4.35 4.35 4.85 4.02 4.49 5.02 4.50 4.87 4.71 5.13 5.28 5.20 4.88 5.27

脚本:

# 读取数据

meta_file <- "test_meta.txt"

gene_exp <- "test_gene.txt"

output <- "test_result.txt"

# 读取数据并进行转置

meta_data <- as.data.frame(read.table(file = meta_file, header = TRUE, row.names = 1, sep = "\t"))

gene_data <- as.data.frame(read.table(file = gene_exp, header = TRUE, row.names = 1))

gene_data <- t(gene_data)

# 获取列名

meta.names <- colnames(meta_data)[1:length(colnames(meta_data))]

gene.names <- colnames(gene_data)[1:length(colnames(gene_data))]

# 初始化空数据框 out

out <- data.frame(matrix(ncol = 4, nrow = 0))

colnames(out) <- c("meta", "gene", "cor", "p")

# 数据标准化

meta_data_scaled <- as.data.frame(scale(meta_data))

gene_data_scaled <- as.data.frame(scale(gene_data))

# 循环计算相关性

for (gene in gene.names) {

  dd <- data.frame(gene1 = gene_data_scaled[, gene], meta_data_scaled)


  for (meta in meta.names) {

    form <- paste(meta, "~ gene1", sep = " ")

    form <- as.formula(form)


    lm.result <- lm(form, dd)


    cor <- as.numeric(summary(lm.result)$coefficients[, 'Estimate'][2])

    p <- as.numeric(summary(lm.result)$coefficients[, 'Pr(>|t|)'][2])


    tmp <- data.frame(meta = meta, gene = gene, cor = cor, p = p)

    out <- rbind(out, tmp)

  }

}

# 写入结果

write.table(out, file = output, quote = FALSE, sep = "\t")


还可以添加群体结构(PCA结果作为其他自变量/协变量)。下面脚本中没做数据标准化处理,表头可能也有问题。

meta_file <- "test_meta.txt"

gene_exp <- "test_gene.txt"

pca_cor = read.table("pca_sort.txt",header=T,sep=" ")

output <- "test_result.txt"

meta_data=as.data.frame(read.table(file=meta_file,header=T,row.names=1,sep="\t"))

gene_data=as.data.frame(read.table(file=gene_exp,header=T,row.names=1))

meta.names=colnames(meta_data)[1:length(colnames(meta_data))]

gene.names=colnames(gene_data)[1:length(colnames(gene_data))]

out=c("meta","Gene","cor","p")

for (gene in gene.names) {

  dd=data.frame(gene1 = gene_data[,gene], pc1 = pca_cor$PC1, pc2 = pca_cor$PC2, pc3 = pca_cor$PC3, meta_data[,1:length(colnames(meta_data))])

for(meta in meta.names) {

  form <- paste(meta,"~ gene1+pc1+pc2+pc3",sep=" ")

  form <- as.formula(form)

  print(form)

  lm.result <- lm(form,dd)

  cor=as.numeric(summary(lm.result)$coefficients[,'Estimate'][2])

  p=as.numeric(summary(lm.result)$coefficients[,'Pr(>|t|)'][2])

  tmp=cbind(meta,gene,cor,p)

  out=rbind(out,tmp)


}

}

write.table(out,file=output,quote=FALSE,sep="\t")

©著作权归作者所有,转载或内容合作请联系作者
平台声明:文章内容(如有图片或视频亦包括在内)由作者上传并发布,文章内容仅代表作者本人观点,简书系信息发布平台,仅提供信息存储服务。

推荐阅读更多精彩内容