sudo apt-get install zlib1g zlib1g-dev libblas3 libgfortran5 liblapack3 libquadmath0 plink1.9 unzip
sudo apt install dirmngr gnupg apt-transport-https ca-certificates software-properties-common
sudo apt install r-base
1. Obtain or generate the base data
The first step of a Polygenic Risk Score (PRS) analysis is to obtain the base data, i.e. GWAS summary statistics, which should contain every trait-associated allele together with its estimated effect. The columns you will typically encounter are listed below (a small sanity-check script follows the list):
CHR: The chromosome in which the SNP resides
BP: Chromosomal coordinate of the SNP; may also appear as POSITION
SNP: SNP ID, usually in the form of an rs-ID
A1: The effect allele of the SNP; may also appear as REF
A2: The non-effect allele of the SNP; may also appear as ALT
N: Number of samples used to obtain the effect size estimate
SE: The standard error (SE) of the effect size estimate
P: The P-value of association between the SNP genotypes and the base phenotype; may also appear as p_value
OR: The effect size estimate of the SNP (odds ratio) if the outcome is binary/case-control; if the outcome is continuous, or treated as continuous, this will usually be BETA
INFO: The imputation information score; may also appear as INFO.plink
MAF: The minor allele frequency (MAF) of the SNP; may also appear as ALT_FREQ or FRQ
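Before touching the real pipeline, it can be worth confirming that a downloaded summary-statistics file actually carries these fields. The snippet below is only a minimal sketch: it assumes a tab-delimited file with a header row, and the file name and column names are placeholders to adapt to your own data.
import pandas as pd

# Hypothetical file name; replace with your own GWAS summary statistics.
SUMSTATS = "base_gwas_sumstats.txt"

# Columns a PRS base file is expected to provide (exact names vary between studies).
REQUIRED = ["CHR", "BP", "SNP", "A1", "A2", "N", "SE", "P", "OR"]
OPTIONAL = ["INFO", "MAF"]

df = pd.read_csv(SUMSTATS, sep="\t", nrows=1000)  # a peek at the top is enough for a header check
missing = [c for c in REQUIRED if c not in df.columns]
print("Missing required columns:", missing if missing else "none")
print("Optional columns present:", [c for c in OPTIONAL if c in df.columns])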
I downloaded someone else's GWAS results from the web as a TXT file; the goal is to rearrange the columns from the order shown on the first line below into the order shown on the second line:
Chromosome Position RSID REF ALT ALT_FREQ ALT_FREQ_1KGASN RSQ INFO HWE_P Pvalue Qvalue N NullLogLike AltLogLike SNPWeight SNPWeightSE OddsRatio WaldStat NullLogDelta NullGeneticVar NullResidualVar NullBias
CHR BP RSID A1 A2 N SE P OR info MAF
1.1 Reordering the columns
First convert the TXT to CSV, then use Python to extract and reorder the columns, and finally convert the CSV back to TXT (a combined pandas version of all three steps is sketched after the scripts).
TXT to CSV
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 11:25:52 2022
@author: Bohan
"""
import pandas as pd
df = pd.read_csv("MDD.10640samples.dosages.hwe6info9maf5.logreg.2017.txt",delimiter="\t", low_memory=False)
df.to_csv("MDD.10640samples.dosages.hwe6info9maf5.logreg.2017.csv", encoding='utf-8', index=False)
Extracting and reordering the columns from the CSV
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 10:49:59 2022
@author: Bohan
"""
import csv
with open("MD_GWAS_SNPresults-original.csv") as f, open("originalarranged.csv","w",newline='') as tmp:
r = csv.reader(f)
wr = csv.writer(tmp)
wr.writerows([a,b,c,d,e,m,q,k,r,i,f] for [a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w] in r)
#[a,b,c,d,e,f,g,h,i,j,k,l,m,n,o,p,q,r,s,t,u,v,w] 是原来文件的列顺序
#[a,b,c,d,e,m,q,k,r,i,f]是按照原顺序提取重排
CSV to TXT
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 13 12:07:58 2022
@author: Bohan
"""
import csv

with open('MD_GWAS_SNPresults-original.arranged.csv', 'r') as src, \
     open('MD_GWAS_SNPresults-original.arranged', 'w') as dst:
    for row in csv.reader(src):
        # join the fields with tabs rather than commas
        dst.write('\t'.join(row))
        dst.write('\n')
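For reference, the three scripts above can be collapsed into a single pandas pass. This is only a sketch: it reuses the file names and column headers shown earlier and assumes the downloaded file is tab-delimited; adjust both to your own data.
import pandas as pd

# Read the original tab-delimited GWAS results.
df = pd.read_csv("MDD.10640samples.dosages.hwe6info9maf5.logreg.2017.txt",
                 delimiter="\t", low_memory=False)

# Select and reorder the columns by name (same mapping as the letter-based script above),
# rename them to the target header, and write a tab-delimited file.
out = df[["Chromosome", "Position", "RSID", "REF", "ALT", "N",
          "SNPWeightSE", "Pvalue", "OddsRatio", "INFO", "ALT_FREQ"]]
out.columns = ["CHR", "BP", "RSID", "A1", "A2", "N", "SE", "P", "OR", "info", "MAF"]
out.to_csv("MD_GWAS_SNPresults-original.arranged", sep="\t", index=False)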
Once that is done, the base data looks like this:
CHR BP RSID A1 A2 N SE P OR info MAF
10 90127 rs185642176 C T 10640 0.004802742809305182 0.4704864679671288 1.0123502866538827 0.905519504189 0.046
10 90164 rs141504207 C G 10640 0.004802742809305182 0.4704864679671288 1.0123502866538827 0.905515996565 0.046
10 94026 rs10904032 G A 10640 0.004807929547222263 0.9907936601472477 1.000113527128114 0.946752744368 0.145
1.2 QC of the base data
Decompress and read MD_GWAS_SNPresults-original.2015.arranged.txt.gz
Keep the header line (NR==1)
Keep rows with MAF greater than 0.01 (column 11 is MAF)
Keep rows with INFO greater than 0.8 (column 10 is INFO)
Compress the result to MD2015.gz
gunzip -c MD_GWAS_SNPresults-original.2015.arranged.txt.gz |\
awk 'NR==1 || ($11 > 0.01) && ($10 > 0.8) {print}' |\
gzip > MD2015.gz
Deduplicate on column 3 (the SNP ID) to obtain MD2015.nodup.gz (no duplicates)
gunzip -c MD2015.gz |\
awk '{seen[$3]++; if(seen[$3]==1){ print}}' |\
gzip - > MD2015.nodup.gz
Remove ambiguous SNPs based on the allele values in columns 4 and 5 (a pandas equivalent of all three filtering steps is sketched after the awk commands)
gunzip -c MD2015.nodup.gz |\
awk '!( ($4=="A" && $5=="T") || \
($4=="T" && $5=="A") || \
($4=="G" && $5=="C") || \
($4=="C" && $5=="G")) {print}' |\
gzip > MD2015.QC.gz
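If you prefer to stay in Python, the sketch below reproduces the same three filters (MAF/INFO thresholds, duplicate removal on the SNP ID, ambiguous-SNP removal) in one go and writes the final MD2015.QC.gz directly. It assumes the tab-delimited, gzip-compressed file produced above and is meant as a cross-check, not a replacement for the awk pipeline.
import pandas as pd

df = pd.read_csv("MD_GWAS_SNPresults-original.2015.arranged.txt.gz",
                 sep="\t", compression="gzip")

# MAF > 0.01 and INFO > 0.8 (same thresholds as the awk filter above)
df = df[(df["MAF"] > 0.01) & (df["info"] > 0.8)]

# Drop duplicated SNP IDs, keeping the first occurrence (column 3 = RSID)
df = df.drop_duplicates(subset="RSID", keep="first")

# Remove ambiguous SNPs (A/T and C/G allele pairs in columns 4 and 5)
ambiguous = {("A", "T"), ("T", "A"), ("C", "G"), ("G", "C")}
df = df[~df.apply(lambda row: (row["A1"], row["A2"]) in ambiguous, axis=1)]

df.to_csv("MD2015.QC.gz", sep="\t", index=False, compression="gzip")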
2. QC of the target data
This example uses the raw files (*.idat) produced by the Illumina ASA v1 SNP array.
Download Illumina's GenomeStudio software directly:
https://files.softwaredownloads.illumina.com/5831d9df-95cb-4427-a7c0-499fe871e1d5/genomestudio-software-v2-0-5-0-installer.zip
2.1 Create the analysis project
You will also need the Infinium Asian Screening Array v1.0 Manifest File (BPM Format - GRCh37)
and the Infinium Asian Screening Array v1.0 Cluster File.
2.2 Evaluate the reference data
2.3 Evaluate the samples and SNPs
For the underlying theory, see the article "GSA/ASA芯片质控" on 简书 (jianshu.com).
Sample evaluation
Call Rate: for a given sample, the proportion of assayed SNPs that were successfully genotyped; the usual threshold is 90% or higher, and samples with too low a call rate are discarded.
LogR Dev: used to assess whether a sample is contaminated.
SNP evaluation
Call Frequency: the proportion of samples in which a given SNP is successfully called.
GenTrain Score: Illumina's own clustering-quality score.
(A minimal sketch of how call rate and call frequency are computed follows.)
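Purely as an illustration (GenomeStudio reports these numbers for you; they are not computed by hand), call rate and call frequency are complementary ratios over the genotype-call matrix. The toy data below are made up.
# Toy genotype-call matrix: rows = samples, columns = SNPs, None = no-call.
calls = [
    ["AA", "AG", None, "GG"],  # sample 1
    ["AA", None, None, "GG"],  # sample 2
    ["AG", "AG", "AA", "GG"],  # sample 3
]

# Sample call rate: fraction of SNPs called for each sample (typical threshold ~0.90).
for s, row in enumerate(calls, start=1):
    rate = sum(g is not None for g in row) / len(row)
    print(f"sample {s}: call rate = {rate:.2f}")

# SNP call frequency: fraction of samples in which each SNP is called.
for j in range(len(calls[0])):
    freq = sum(calls[i][j] is not None for i in range(len(calls))) / len(calls)
    print(f"SNP {j + 1}: call frequency = {freq:.2f}")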
The basic workflow is: create the project with your samples, add the manifest information, set the reference genome to the build matching the GWAS, and use the bundled plink-input-report-plugin-v2-1-4 plug-in to export .ped and .map files.
plink1.9 --file new --out new --make-bed
plink1.9 --bfile new --maf 0.01 --hwe 1e-6 --geno 0.02 --mind 0.02 --write-snplist --make-just-fam --out new.QC
plink1.9 --bfile new --keep new.QC.fam --extract new.QC.snplist --indep-pairwise 200 50 0.25 --out new.QC
plink1.9 --bfile new --extract new.QC.prune.in --keep new.QC.fam --het --out new.QC
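A brief note on these four plink1.9 commands (standard plink flag semantics): the first converts the exported .ped/.map pair into binary .bed/.bim/.fam files; the second applies the usual filters (--maf 0.01 removes rare SNPs, --hwe 1e-6 removes SNPs failing Hardy-Weinberg equilibrium, --geno 0.02 removes SNPs missing in more than 2% of samples, --mind 0.02 removes samples with more than 2% missing genotypes) and writes the lists of passing SNPs and samples; the third prunes SNPs in high LD (windows of 200 variants, step size 50, r2 threshold 0.25); the fourth computes per-sample heterozygosity F statistics on the pruned SNPs, which the R step below uses to discard samples whose F lies more than 3 standard deviations from the mean.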
sudo R
install.packages("data.table")
library(data.table)
dat <- fread("new.QC.het")
valid <- dat[F<=mean(F)+3*sd(F) & F>=mean(F)-3*sd(F)]
fwrite(valid[,c("FID","IID")], "new.valid.sample", sep="\t")
install.packages("magrittr")
install.packages("R.utils")
library(magrittr)
bim <- fread("new.bim") %>%
setnames(., colnames(.), c("CHR", "SNP", "CM", "BP", "B.A1", "B.A2")) %>%
.[,c("B.A1","B.A2"):=list(toupper(B.A1), toupper(B.A2))]
MD <- fread("MD2015.QC.gz") %>%
.[,c("A1","A2"):=list(toupper(A1), toupper(A2))]
qc <- fread("new.QC.snplist", header=F)
info <- merge(bim, MD, by=c("SNP", "CHR", "BP")) %>%
.[SNP %in% qc[,V1]]
complement <- function(x) {
    switch(x,
        "A" = "T",
        "C" = "G",
        "T" = "A",
        "G" = "C",
        return(NA)
    )
}
info.match <- info[A1 == B.A1 & A2 == B.A2, SNP]
com.snps <- info[sapply(B.A1, complement) == A1 &
sapply(B.A2, complement) == A2, SNP]
bim[SNP %in% com.snps, c("B.A1", "B.A2") :=
list(sapply(B.A1, complement),
sapply(B.A2, complement))]
recode.snps <- info[B.A1==A2 & B.A2==A1, SNP]
bim[SNP %in% recode.snps, c("B.A1", "B.A2") :=
list(B.A2, B.A1)]
com.recode <- info[sapply(B.A1, complement) == A2 &
sapply(B.A2, complement) == A1, SNP]
bim[SNP %in% com.recode, c("B.A1", "B.A2") :=
list(sapply(B.A2, complement),
sapply(B.A1, complement))]
fwrite(bim[,c("SNP", "B.A1")], "EUR.a1", col.names=F, sep="\t")
mismatch <- bim[!(SNP %in% info.match |
SNP %in% com.snps |
SNP %in% recode.snps |
SNP %in% com.recode), SNP]
write.table(mismatch, "EUR.mismatch", quote=F, row.names=F, col.names=F)
q()
plink1.9 --bfile new --extract new.QC.prune.in --keep new.valid.sample --check-sex --out new.QC
R
library(data.table)
valid <- fread("new.valid.sample")
dat <- fread("new.QC.sexcheck")[FID%in%valid$FID]
fwrite(dat[STATUS=="OK",c("FID","IID")], "new.QC.valid", sep="\t")
q() # exit R
plink1.9 --bfile new --extract new.QC.prune.in --keep new.QC.valid --rel-cutoff 0.125 --out new.QC
plink1.9 --bfile new --make-bed --keep new.QC.rel.id --out new.QC --extract new.QC.snplist --exclude EUR.mismatch --a1-allele EUR.a1
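Here --rel-cutoff 0.125 removes one member of every pair of samples whose estimated genomic relatedness exceeds 0.125 (approximately third-degree relatives), and the final --make-bed call writes the fully QC'd target data (new.QC.bed/.bim/.fam), keeping only the retained samples and SNPs, dropping the mismatching SNPs and forcing A1 to match the base data.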
3. Calculating the PRS
R
library(data.table)
dat <- fread("MD2015.QC.gz")
fwrite(dat[,BETA:=log(OR)], "new.QC.Transformed", sep="\t")
q() # exit R
plink1.9 --bfile new.QC --clump-p1 1 --clump-r2 0.1 --clump-kb 250 --clump new.QC.Transformed --clump-snp-field SNP --clump-field P --out new
awk 'NR!=1{print $3}' new.clumped > new.valid.snp
awk '{print $3,$8}' new.QC.Transformed > new.pvalue
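The clumping step keeps one index SNP per LD clump (--clump-p1 1 considers every SNP, while --clump-r2 0.1 and --clump-kb 250 define the LD and distance windows). The first awk line collects those index SNP IDs (column 3 of new.clumped) into new.valid.snp; the second extracts the SNP ID and P-value (columns 3 and 8 of the transformed base data) into new.pvalue for --q-score-range. The range_list file built next defines the p-value thresholds at which the PRS will be computed.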
echo "0.001 0 0.001" > range_list
echo "0.05 0 0.05" >> range_list
echo "0.1 0 0.1" >> range_list
echo "0.2 0 0.2" >> range_list
echo "0.3 0 0.3" >> range_list
echo "0.4 0 0.4" >> range_list
echo "0.5 0 0.5" >> range_list
plink1.9 --bfile new.QC --score new.QC.Transformed 3 4 12 header --q-score-range range_list new.pvalue --extract new.valid.snp --out new
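In the --score call, 3, 4 and 12 are the columns of new.QC.Transformed holding the SNP ID, the effect allele (A1) and the BETA appended in the R step above (the original file has 11 columns, so BETA is column 12); "header" tells plink the file has a header line, and --q-score-range writes one new.<threshold>.profile file per entry in range_list.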
# First, we need to perform pruning
plink1.9 --bfile new.QC --indep-pairwise 200 50 0.25 --out new
# Then we calculate the first 6 PCs
plink1.9 --bfile new.QC --extract new.prune.in --pca 6 --out new
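These six PCs are merged into the phenotype table below and included as covariates in the regressions to account for population stratification.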
R
library(data.table)
library(magrittr)
p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5)
phenotype <- fread("new.phenotype")
pcs <- fread("new.eigenvec", header=F) %>%
setnames(., colnames(.), c("FID", "IID", paste0("PC",1:6)) )
covariate <- fread("new.cov")
pheno <- merge(phenotype, covariate) %>%
merge(., pcs)
null.r2 <- summary(lm(Trait1~., data=pheno[,-c("FID", "IID")]))$r.squared
prs.result <- NULL
for(i in p.threshold){
    pheno.prs <- paste0("new.", i, ".profile") %>%
        fread(.) %>%
        .[,c("FID", "IID", "SCORE")] %>%
        merge(., pheno, by=c("FID", "IID"))
    model <- lm(Trait1~., data=pheno.prs[,-c("FID","IID")]) %>%
        summary
    model.r2 <- model$r.squared
    prs.r2 <- model.r2-null.r2
    prs.coef <- model$coeff["SCORE",]
    prs.result %<>% rbind(.,
        data.frame(Threshold=i, R2=prs.r2,
                   P=as.numeric(prs.coef[4]),
                   BETA=as.numeric(prs.coef[1]),
                   SE=as.numeric(prs.coef[2])))
}
print(prs.result[which.max(prs.result$R2),])
q() # exit R
The same analysis can also be written in base R, with explanatory comments:
R
p.threshold <- c(0.001,0.05,0.1,0.2,0.3,0.4,0.5)
# Read in the phenotype file
phenotype <- read.table("new.phenotype", header=T)
# Read in the PCs
pcs <- read.table("new.eigenvec", header=F)
# The default output from plink does not include a header,
# so we add the appropriate headers (1:6 because there are 6 PCs)
colnames(pcs) <- c("FID", "IID", paste0("PC",1:6))
# Read in the covariates (here, it is sex)
covariate <- read.table("new.cov", header=T)
# Now merge the files
pheno <- merge(merge(phenotype, covariate, by=c("FID", "IID")), pcs, by=c("FID","IID"))
# We can then calculate the null model (model without PRS) using a linear regression
# (as the trait is treated as quantitative)
null.model <- lm(Trait1~., data=pheno[,!colnames(pheno)%in%c("FID","IID")])
# And the R2 of the null model is
null.r2 <- summary(null.model)$r.squared
prs.result <- NULL
for(i in p.threshold){
    # Read the PRS profile for this p-value threshold and keep FID, IID and SCORE
    prs <- read.table(paste0("new.", i, ".profile"), header=T)
    pheno.prs <- merge(pheno, prs[,c("FID","IID","SCORE")], by=c("FID","IID"))
    # Regress the trait on the PRS plus the covariates, ignoring FID and IID
    model <- lm(Trait1~., data=pheno.prs[,!colnames(pheno.prs)%in%c("FID","IID")])
    model.r2 <- summary(model)$r.squared
    # R2 of PRS is simply calculated as the model R2 minus the null R2
    prs.r2 <- model.r2-null.r2
    # We can also obtain the coefficient and p-value of association of the PRS as follows
    prs.coef <- summary(model)$coeff["SCORE",]
    prs.beta <- as.numeric(prs.coef[1])
    prs.se <- as.numeric(prs.coef[2])
    prs.p <- as.numeric(prs.coef[4])
    # We can then store the results
    prs.result <- rbind(prs.result, data.frame(Threshold=i, R2=prs.r2, P=prs.p, BETA=prs.beta, SE=prs.se))
}
# Best result is:
prs.result[which.max(prs.result$R2),]
q() # exit R