原理和公式参考:
- https://www.jianshu.com/p/c7ba1aa543fe
- wikipedia: https://en.wikipedia.org/wiki/Tajima%27s_D
- https://www.google.com.hk/search?q=pvalue+for+tajima%27s+D&newwindow=1&client=safari&ei=URDsYZy_OZavoATIyJSYAg&ved=0ahUKEwjcq5ngxMX1AhWWF4gKHUgkBSMQ4dUDCA0&uact=5&oq=pvalue+for+tajima%27s+D&gs_lcp=Cgdnd3Mtd2l6EAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsAMyBwgAEEcQsANKBAhBGABKBQhAEgExSgQIRhgAUABYAGCb0QRoAXABeACAAQCIAQCSAQCYAQDIAQnAAQE&sclient=gws-wiz
- https://zhuanlan.zhihu.com/p/52064863
1. 加载数据
##### load data
data<-read.table("mydata.genotype",header=T) #### change this
data<-data[data$Chr=="chr18"] #### select car
data1<-data[,4:dim(data)[2]]
2. 设定参数和两两个体的排列组合
nindiv<-dim(data1)[2]
nsnp<-dim(data1)[1]
pos_data<-data[,1:2] #### change this
pop1_num<-1:9 ###### change this (前9个是population1,后面是population2,
######这个只用于计算Fst,对tajima‘s D没有用)
pop2_num<-10:17 ##### change this
between_or_within_pop<-c()
for(i in 1:(nindiv-1)){
for(k in (i+1):nindiv){
print(paste0(i," and ",k))
if((i %in% pop1_num & k %in% pop2_num) | (i %in% pop2_num & k %in% pop1_num)){
between_or_within_pop<-c(between_or_within_pop,"between")
}else{between_or_within_pop<-c(between_or_within_pop,"within")}}
}
#### 17个个体,两两组合类型一共136种。
#### 写清楚哪些是群体间的,哪些是群体内的(Fst计算要用)
#### save between pop pairs num and within pop pair num
nbetween<-sum(between_or_within_pop=="between")
nwithin<-sum(between_or_within_pop=="within")
3. 定义计算函数
##### define function
tajima_and_fst<-function(input_data){
theta_matrix<-matrix(nrow=dim(input_data)[1],ncol=choose(nindiv,2),0)
for(snp in 1:dim(input_data)[1]){
array<-input_data[snp,]
pair_count=0
for(i in 1:(nindiv-1)){
for(k in (i+1):nindiv){
pair_count=pair_count+1
if(array[i][1,1]!=array[k][1,1]){
if (array[i][1,1]=="-"|array[k][1,1]=="-"){
theta_matrix[snp,pair_count]=NA
}else{theta_matrix[snp,pair_count]=1}
}
}
}
}
theta_matrix1<-theta_matrix
theta_matrix1[is.na(theta_matrix)]=0
#### theta_pi是考虑到NA值的,使用了总位点数/非gap位点数 作为标准化常数
nromalized_factor<-dim(theta_matrix1)[1]/apply(!is.na(theta_matrix),2,sum)
nromalized_factor[nromalized_factor==Inf]<-1
a1<-sum(1/1:(dim(input_data)[1]-1))
a2<-sum(1/(1:(dim(input_data)[1]-1))**2)
c1<-b1-1/a1
c2<-b2-((nindiv+2)/(a1*nindiv))+(a2/a1**2)
e1<-c1/a1
e2<-c2/(a1**2+a2)
S<-sum(apply(theta_matrix1,1,sum)!=0)
theta_pi<-sum(apply(theta_matrix1,2,sum)*nromalized_factor)/
(combination_times-sum(apply(!is.na(theta_matrix),2,sum)==0)
)
theta_w<-S/a1
var_d<-(e1*S+e2*S*(S-1))**(1/2)
tajimasd<-(theta_pi-theta_w)/var_d
theta_pi_between<-
sum(apply(theta_matrix1[,between_or_within_pop=="between"],2,sum)*nromalized_factor)/nbetween
theta_pi_within<-
sum(apply(theta_matrix1[,between_or_within_pop=="within"],2,sum)*nromalized_factor)/nbetween
fst<-(theta_pi_between-theta_pi_within)/theta_pi_between
return(list(tajimasd,fst))
}
4. 定义扫描基因组的sliding window的参数
##### define sliding window parameters
window_size=20 #### change this
step_wise=1 #### change this
combination_times<-choose(nindiv,2)
b1<-nindiv+1/3*(nindiv-1)
b2<-(2*(nindiv**2+nindiv+3))/(9*nindiv*(nindiv-1))
total_step=(nsnp-window_size)/step_wise
tajimad_list<-c()
fst_list<-c()
window_position<-matrix(nrow=total_step,ncol=2)
5. 启动 window sliding
##### run sliding windows
for(pace in 1:total_step){
print(pace/total_step)
start<-1+step_wise*(pace-1)
end<-1+step_wise*(pace-1)+window_size
res<-tajima_and_fst(data1[start:end,]) #### using the function
new_d<-res[[1]] ### saving result of D
new_fst<-res[[2]] ### saving result of fst
tajimad_list<-c(tajimad_list,new_d)
fst_list<-c(fst_list,new_fst)
window_position[pace,]<-c(start,end)
}
6. 储存结果,画图
#### concate output result
out<-data.frame(chr=pos_data[,1][1],
snp_position_start=pos_data[,2][window_position[,1]],
snp_position_end=pos_data[,2][window_position[,2]],
relative_start=window_position[,1],
relative_end=window_position[,2],
tajimasd=(tajimad_list-mean(tajimad_list))/sd(tajimad_list), #### 这里标准化、中心化了
fst=(fst_list-mean(fst_list))/sd(fst_list)) #### 这里标准化、中心化了
write.table(out,"chr18.tajimasd.fst.txt",sep="\t",quote = F,row.names = F)
head(out)
7. 画图
library(ggplot2)
ggplot(data=out)+
geom_point(aes(x=(snp_position_start+snp_position_end)/2,y=fst,color="fst"))
ggplot(data=out)+
geom_point(aes(x=(snp_position_start+snp_position_end)/2,y=tajimasd,color="tajimasd2"))
暂时还没搞懂怎么计算P值...好像要序列模拟,比较麻烦,后面再思考。或者套个正态分布看离群值?但这样有点武断,而且没有假设检验。
Tajima原文中Tajimas' D和计算机序列模拟的比对结果。因此绝对值大于2应该睡rule of thumb,应该就算显著。