Code reproduction:
1. Converting the results (repertoires recovered with TRUST4)
The results recovered by TRUST4 cannot be fed into vdjtools directly, so the column names need to be changed first.
Example: the samples are split into two groups, Control and Treatment, with 4 samples per group.
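For reference, the first eight columns of a TRUST4 report (*_report.tsv) should be #count, frequency, CDR3nt, CDR3aa, V, D, J and C (this is my reading of the TRUST4 simple-report format; check your own header); the conversion below simply keeps those eight columns and renames them to vdjtools-style names:
# assumed TRUST4 header (first 8 columns) -> vdjtools-style names used below
# #count -> count, frequency -> freq, CDR3nt -> cdr3nt, CDR3aa -> cdr3aa, V/D/J/C -> v/d/j/c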
name <- read.table("./name.txt")
n <- 4 # number of samples per group
# Split a converted TRUST4 table into the four chains by the V-gene prefix and recompute per-chain frequencies
TCR_split <- function(matrix,output_TRA,output_TRB,output_TRD,output_TRG){
trust4_results <- matrix
TRA_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRAV"),]
TRB_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRBV"),]
TRD_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRDV"),]
TRG_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRGV"),]
TRA_results$freq <- TRA_results$count/sum(TRA_results$count)
TRB_results$freq <- TRB_results$count/sum(TRB_results$count)
TRD_results$freq <- TRD_results$count/sum(TRD_results$count)
TRG_results$freq <- TRG_results$count/sum(TRG_results$count)
write.table(TRA_results,file = output_TRA,quote = F,sep = "\t",row.names = F)
write.table(TRB_results,file = output_TRB,quote = F,sep = "\t",row.names = F)
write.table(TRD_results,file = output_TRD,quote = F,sep = "\t",row.names = F)
write.table(TRG_results,file = output_TRG,quote = F,sep = "\t",row.names = F)
}
metadata_all <- data.frame(file_name="",sample_id="",
sample_type=c(rep("Control",n),rep("Treatment",n)),
tcr_type="")
metadata <- data.frame(file_name="",sample_id="",
sample_type=c(rep("Control",n*4),rep("Treatment",n*4)),
tcr_type="")
count_sum <- data.frame(name=name$V1,sum=rep(0,nrow(name)))
for (i in 1:nrow(name)){
input <- paste0("./trust4/",name[i,"V1"],"_report.tsv")
output <- paste0("./convert_results/",name[i,"V1"],".txt")
trust4_results <- read.csv(input,sep="\t")
trust4_results <- trust4_results[,1:8]
colnames(trust4_results) <- c("count","freq","cdr3nt","cdr3aa","v","d","j","c")
trust4_results <- trust4_results[trust4_results$cdr3aa!="out_of_frame",] # drop out-of-frame CDR3s (safe even when none are present)
row.names(trust4_results) <- 1:nrow(trust4_results)
count_sum[i,"sum"] <- sum(trust4_results$count)
write.table(trust4_results,file = output,quote = F,sep = "\t",row.names = F)
output_TRA <- paste0("./convert_results/",name[i,"V1"],"_TRA.txt")
output_TRB <- paste0("./convert_results/",name[i,"V1"],"_TRB.txt")
output_TRD <- paste0("./convert_results/",name[i,"V1"],"_TRD.txt")
output_TRG <- paste0("./convert_results/",name[i,"V1"],"_TRG.txt")
TCR_split(trust4_results,output_TRA,output_TRB,output_TRD,output_TRG)
}
for (i in 1:nrow(name)){
output <- paste0("./convert_results/",name[i,"V1"],".txt")
output_TRA <- paste0("./convert_results/",name[i,"V1"],"_TRA.txt")
output_TRB <- paste0("./convert_results/",name[i,"V1"],"_TRB.txt")
output_TRD <- paste0("./convert_results/",name[i,"V1"],"_TRD.txt")
output_TRG <- paste0("./convert_results/",name[i,"V1"],"_TRG.txt")
metadata_all[i,"file_name"] <- output
metadata_all[i,"sample_id"] <- name[i,"V1"]
metadata_all[i,"tcr_type"] <- "TCR"
metadata[4*i-3,"file_name"] <- output_TRA
metadata[4*i-3,"tcr_type"] <- "TRA"
metadata[4*i-2,"file_name"] <- output_TRB
metadata[4*i-2,"tcr_type"] <- "TRB"
metadata[4*i-1,"file_name"] <- output_TRD
metadata[4*i-1,"tcr_type"] <- "TRD"
metadata[4*i,"file_name"] <- output_TRG
metadata[4*i,"tcr_type"] <- "TRG"
metadata[(4*i-3):(4*i),"sample_id"] <- name[i,"V1"]
}
metadata_all$sample_id <- paste0(metadata_all$sample_id,"_",metadata_all$tcr_type)
metadata$sample_id <- paste0(metadata$sample_id,"_",metadata$tcr_type)
write.table(metadata_all,file = "./metadata_all.txt",sep="\t",row.names = F,quote = F)
write.table(metadata,file = "./metadata.txt",sep="\t",row.names = F,quote = F)
write.table(count_sum,file = "./count_sum.txt",sep="\t",row.names = F,quote = F)
metadata_all is the version that does not split the chains into TRA/TRB/TRD/TRG:
file_name | sample_id | sample_type | tcr_type |
---|---|---|---|
./convert_results/MSTCR_01.txt | MSTCR_01 | Control | TCR |
./convert_results/MSTCR_02.txt | MSTCR_02 | Control | TCR |
./convert_results/MSTCR_03.txt | MSTCR_03 | Control | TCR |
./convert_results/MSTCR_04.txt | MSTCR_04 | Control | TCR |
./convert_results/MSTCR_05.txt | MSTCR_05 | Treatment | TCR |
./convert_results/MSTCR_06.txt | MSTCR_06 | Treatment | TCR |
./convert_results/MSTCR_07.txt | MSTCR_07 | Treatment | TCR |
./convert_results/MSTCR_08.txt | MSTCR_08 | Treatment | TCR |
metadata is the version split into the four chains TRA, TRB, TRD and TRG:
file_name | sample_id | sample_type | tcr_type |
---|---|---|---|
./convert_results/MSTCR_01_TRA.txt | MSTCR_01_TRA | Control | TRA |
./convert_results/MSTCR_01_TRB.txt | MSTCR_01_TRB | Control | TRB |
./convert_results/MSTCR_01_TRD.txt | MSTCR_01_TRD | Control | TRD |
./convert_results/MSTCR_01_TRG.txt | MSTCR_01_TRG | Control | TRG |
./convert_results/MSTCR_02_TRA.txt | MSTCR_02_TRA | Control | TRA |
./convert_results/MSTCR_02_TRB.txt | MSTCR_02_TRB | Control | TRB |
./convert_results/MSTCR_02_TRD.txt | MSTCR_02_TRD | Control | TRD |
./convert_results/MSTCR_02_TRG.txt | MSTCR_02_TRG | Control | TRG |
./convert_results/MSTCR_03_TRA.txt | MSTCR_03_TRA | Control | TRA |
./convert_results/MSTCR_03_TRB.txt | MSTCR_03_TRB | Control | TRB |
./convert_results/MSTCR_03_TRD.txt | MSTCR_03_TRD | Control | TRD |
./convert_results/MSTCR_03_TRG.txt | MSTCR_03_TRG | Control | TRG |
./convert_results/MSTCR_04_TRA.txt | MSTCR_04_TRA | Control | TRA |
./convert_results/MSTCR_04_TRB.txt | MSTCR_04_TRB | Control | TRB |
./convert_results/MSTCR_04_TRD.txt | MSTCR_04_TRD | Control | TRD |
./convert_results/MSTCR_04_TRG.txt | MSTCR_04_TRG | Control | TRG |
./convert_results/MSTCR_05_TRA.txt | MSTCR_05_TRA | Treatment | TRA |
./convert_results/MSTCR_05_TRB.txt | MSTCR_05_TRB | Treatment | TRB |
./convert_results/MSTCR_05_TRD.txt | MSTCR_05_TRD | Treatment | TRD |
./convert_results/MSTCR_05_TRG.txt | MSTCR_05_TRG | Treatment | TRG |
./convert_results/MSTCR_06_TRA.txt | MSTCR_06_TRA | Treatment | TRA |
./convert_results/MSTCR_06_TRB.txt | MSTCR_06_TRB | Treatment | TRB |
./convert_results/MSTCR_06_TRD.txt | MSTCR_06_TRD | Treatment | TRD |
./convert_results/MSTCR_06_TRG.txt | MSTCR_06_TRG | Treatment | TRG |
./convert_results/MSTCR_07_TRA.txt | MSTCR_07_TRA | Treatment | TRA |
./convert_results/MSTCR_07_TRB.txt | MSTCR_07_TRB | Treatment | TRB |
./convert_results/MSTCR_07_TRD.txt | MSTCR_07_TRD | Treatment | TRD |
./convert_results/MSTCR_07_TRG.txt | MSTCR_07_TRG | Treatment | TRG |
./convert_results/MSTCR_08_TRA.txt | MSTCR_08_TRA | Treatment | TRA |
./convert_results/MSTCR_08_TRB.txt | MSTCR_08_TRB | Treatment | TRB |
./convert_results/MSTCR_08_TRD.txt | MSTCR_08_TRD | Treatment | TRD |
./convert_results/MSTCR_08_TRG.txt | MSTCR_08_TRG | Treatment | TRG |
count_sum records each sample's total counts and is used for result normalization.
name | sum |
---|---|
MSTCR_01 | 182672 |
MSTCR_02 | 159604 |
MSTCR_03 | 236935 |
MSTCR_04 | 169434 |
MSTCR_05 | 79721 |
MSTCR_06 | 34295 |
MSTCR_07 | 45231 |
MSTCR_08 | 40237 |
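Because the downsampling in step 2 is done without replacement, the sampling depth cannot exceed the smallest total count in this table. A minimal sketch for picking it from the count_sum.txt written above:
count_sum <- read.table("./count_sum.txt",sep = "\t",header = TRUE)
size <- min(count_sum$sum) # the largest depth that every sample can support without replacement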
2. Result normalization (optional)
size <- 500000 # downsampling depth: pick it from count_sum above; it must not exceed the smallest sample's total count, because sampling is done without replacement
name <- read.table("./name.txt")
TCR_split <- function(matrix,output_TRA,output_TRB,output_TRD,output_TRG){
trust4_results <- matrix
TRA_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRAV"),]
TRB_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRBV"),]
TRD_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRDV"),]
TRG_results <- trust4_results[which(substr(trust4_results$v,1,4)=="TRGV"),]
write.table(TRA_results,file = output_TRA,quote = F,sep = "\t",row.names = F)
write.table(TRB_results,file = output_TRB,quote = F,sep = "\t",row.names = F)
write.table(TRD_results,file = output_TRD,quote = F,sep = "\t",row.names = F)
write.table(TRG_results,file = output_TRG,quote = F,sep = "\t",row.names = F)
}
tcr_normalization <- function(trust4_results,size){
# expand each clonotype into "count" copies of its row index, then downsample without replacement
pool <- rep(1:nrow(trust4_results),trust4_results$count)
all <- sample(pool,size = size,replace = F)
count <- table(all)
# keep the rows in the same order as the table() names so the counts stay aligned with their clonotypes
trust4_results <- trust4_results[as.numeric(names(count)),]
trust4_results$count <- as.numeric(count)
trust4_results$freq <- trust4_results$count/size
trust4_results <- trust4_results[order(trust4_results$count,decreasing = T),]
rownames(trust4_results) <- 1:nrow(trust4_results)
return(trust4_results)
}
metadata_all_normal <- data.frame(file_name="",sample_id="",
sample_type=c(rep("Control",n),rep("Treatment",n)),
tcr_type="")
metadata_normal <- data.frame(file_name="",sample_id="",
sample_type=c(rep("Control",n*4),rep("Treatment",n*4)),
tcr_type="")
for (i in 1:nrow(name)){
input <- paste0("./trust4/",name[i,"V1"],"_report.tsv")
output <- paste0("./convert_results_normal/",name[i,"V1"],".txt")
trust4_results <- read.csv(input,sep="\t")
trust4_results <- trust4_results[,1:8]
colnames(trust4_results) <- c("count","freq","cdr3nt","cdr3aa","v","d","j","c")
trust4_results <- trust4_results[trust4_results$cdr3aa!="out_of_frame",] # drop out-of-frame CDR3s (safe even when none are present)
row.names(trust4_results) <- 1:nrow(trust4_results)
trust4_results <- tcr_normalization(trust4_results,size)
write.table(trust4_results,file = output,quote = F,sep = "\t",row.names = F)
output_TRA <- paste0("./convert_results_normal/",name[i,"V1"],"_TRA.txt")
output_TRB <- paste0("./convert_results_normal/",name[i,"V1"],"_TRB.txt")
output_TRD <- paste0("./convert_results_normal/",name[i,"V1"],"_TRD.txt")
output_TRG <- paste0("./convert_results_normal/",name[i,"V1"],"_TRG.txt")
TCR_split(trust4_results,output_TRA,output_TRB,output_TRD,output_TRG)
metadata_all_normal[i,"file_name"] <- output
metadata_all_normal[i,"sample_id"] <- strsplit(name[i,"V1"],"_")[[1]][1]
metadata_all_normal[i,"tcr_type"] <- "TCR"
metadata_normal[4*i-3,"file_name"] <- output_TRA
metadata_normal[4*i-3,"tcr_type"] <- "TRA"
metadata_normal[4*i-2,"file_name"] <- output_TRB
metadata_normal[4*i-2,"tcr_type"] <- "TRB"
metadata_normal[4*i-1,"file_name"] <- output_TRD
metadata_normal[4*i-1,"tcr_type"] <- "TRD"
metadata_normal[4*i,"file_name"] <- output_TRG
metadata_normal[4*i,"tcr_type"] <- "TRG"
metadata_normal[(4*i-3):(4*i),"sample_id"] <- name[i,"V1"]
}
metadata_all_normal$sample_id <- paste0(metadata_all_normal$sample_id,"_",metadata_all_normal$tcr_type)
metadata_normal$sample_id <- paste0(metadata_normal$sample_id,"_",metadata_normal$tcr_type)
write.table(metadata_all_normal,file = "./metadata_all_normal.txt",sep="\t",row.names = F,quote = F)
write.table(metadata_normal,file = "./metadata_normal.txt",sep="\t",row.names = F,quote = F)
3. Repertoire diversity (Richness / Diversity / Clonality): calculation and visualization
3.1 Calculating repertoire diversity
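The script below computes, per sample, richness $R$ (the number of distinct clonotypes), Shannon diversity and clonality from the clonotype frequencies $p_i$ (the freq column):
$$H = -\sum_i p_i \ln p_i,\qquad E = \frac{H}{\ln R},\qquad C = 1 - E$$
Shannon_index returns $H$, and clonality returns $C$, i.e. one minus Pielou's evenness $E$.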
metadata <- read.csv("./metadata.txt",sep = "\t")
Shannon_index <- function(list_p){
sum=0
for (p in list_p){
sum <- p*log(p) + sum
}
H <- -sum
return(H)
}
clonality <- function(diversity,richness){
E<- diversity/log(richness)
C <- 1-E
return(C)
}
file_path <- metadata$file_name
results <- data.frame(sample=metadata$sample_id)
for (i in 1:length(file_path)){
df <- read.csv(file = file_path[i],
stringsAsFactors = F,sep = "\t")
temp <- try(list_p <- df[,"freq"])
if ('try-error' %in% class(temp)){next}
results[i,"richness"] <- nrow(df)
results[i,"diversity"] <- Shannon_index(list_p)
results[i,"clonality"] <- clonality(Shannon_index(list_p),nrow(df))
}
write.table(results,"./results/basic.txt",sep = "\t",row.names = F,quote = F)
3.2 Calculating diversity with vdjtools
java -jar e:/vdjtools/vdjtools-1.2.1.jar CalcDiversityStats -m .\metadata.txt .\results\
For vdjtools usage, see: https://www.jianshu.com/p/ea7edd430a5a
inputpath <- "./results/"
outputpath <- "./results/"
df <- read.csv(paste0(inputpath,"diversity.strict.exact.txt"), sep = "\t",stringsAsFactors = FALSE)
df$real_diversity <- log(df$shannonWienerIndex_mean) # shannonWienerIndex_mean is exp(H), so log() recovers the Shannon index H
df$clonality <- 1 - df$normalizedShannonWienerIndex_mean # clonality = 1 - evenness
for (k in c("TRA","TRB","TRD","TRG")){
work_df <- df[which(df$tcr_type==k),]
sample_name <- work_df$sample_id
richness<- work_df$observedDiversity_mean
diversity <- work_df$real_diversity
clonality <- work_df$clonality
work_df <- data.frame(sample_name,richness,diversity,clonality)
write.table(work_df,file = paste0(outputpath,k,"-Basic.txt"),row.names = F,quote = F,sep="\t")
}
Example output, TRB-Basic.txt:
sample_name | richness | diversity | clonality |
---|---|---|---|
MSTCR_01_TRB | 35170 | 9.89085335938176 | 0.0551297471186711 |
MSTCR_02_TRB | 29431 | 9.68714727061554 | 0.0585683225389396 |
MSTCR_03_TRB | 50768 | 10.3132468709197 | 0.0481563088949662 |
MSTCR_04_TRB | 34260 | 9.89423352728728 | 0.0524338444699065 |
MSTCR_05_TRB | 7236 | 7.96452265160162 | 0.103782995009542 |
MSTCR_06_TRB | 3534 | 7.2072238519678 | 0.117862903222087 |
MSTCR_07_TRB | 3423 | 7.12183471831036 | 0.124896027125899 |
MSTCR_08_TRB | 4354 | 7.35997369287484 | 0.121600997692666 |
# Visualization
rm(list = ls())
inputpath <- "./results/"
outputpath <- "./output/"
library(ggplot2)
library(reshape2)
library(ggpubr)
df <- read.csv(paste0(inputpath,"diversity.strict.exact.txt"), sep = "\t",stringsAsFactors = FALSE)
df$real_diversity <- log(df$shannonWienerIndex_mean)
df$clonality <- 1 - df$normalizedShannonWienerIndex_mean
tcr_type <- "TRB"
work_df <- df[which(df$tcr_type==tcr_type),]
sample_name <- work_df$sample_id
Group <- work_df$sample_type
richness<- work_df$observedDiversity_mean
diversity <- work_df$real_diversity
clonality <- work_df$clonality
work_df <- data.frame(Group,richness,diversity,clonality)
work_df$Group <- factor(work_df$Group)
work_res<- melt(work_df, id.vars = "Group")
work_richness <- work_res[which(work_res$variable=="richness"),]
work_diversity <- work_res[which(work_res$variable=="diversity"),]
work_clonality <- work_res[which(work_res$variable=="clonality"),]
p1 <- ggboxplot(work_diversity, x = "Group", y = "value",
legend = "right",color = "Group", palette = "nejm",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Diversity", xlab = "",size = 0.5,width =0.5,
ylab = paste0("Shannon Diversity Index of \n ",tcr_type," repertoire")) +
ylim(c(0,max(work_diversity$value)+min(work_diversity$value)))+
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p1 + stat_compare_means(aes(group = work_df$Group),
label.x = 1.5,
label.y = max(work_diversity$value)+min(work_diversity$value),
label = "p.signif",
method = 'wilcox.test',hide.ns = TRUE,size=8)
ggsave(paste0(outputpath,paste0(tcr_type,"-Diversity.pdf")),dpi = 600,width = 8,height = 8)
dev.off()
p2 <- ggboxplot(work_richness, x = "Group", y = "value",
legend = "right",color = "Group", palette = "nejm",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Richness", xlab = "",size = 0.5,width =0.5,
ylab = paste0("Richness of \n ",tcr_type," repertoire"))+
ylim(c(0,max(work_richness$value)+min(work_richness$value)))+
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p2 + stat_compare_means(aes(group =work_df$Group),
label = "p.signif",label.x = 1.5,
label.y = max(work_richness$value)+min(work_richness$value),
method = 'wilcox.test',hide.ns = TRUE,size=8)
ggsave(paste0(outputpath,paste0(tcr_type,"-Richness.pdf")),dpi = 600,width = 8,height = 8)
dev.off()
p3 <- ggboxplot(work_clonality, x = "Group", y = "value",
legend = "right",color = "Group", palette = "nejm",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Clonality", xlab = "",size = 0.5,width =0.5,
ylab = paste0("Clonality Index of \n ",tcr_type," repertoire"))+
ylim(c(0,max(work_clonality$value)+min(work_clonality$value)))+
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p3 + stat_compare_means(aes(group =work_df$Group),
label = "p.signif",label.x = 1.5,
label.y = max(work_clonality$value)+min(work_clonality$value),
method = 'wilcox.test',hide.ns = TRUE,size=8)
ggsave(paste0(outputpath,paste0(tcr_type,"-Clonality.pdf")),dpi = 600,width = 8,height = 8)
dev.off()
4. CDR3 length distribution analysis
rm(list = ls())
library(ggplot2)
library(reshape2)
library(ggpubr)
lenth_distribution <- function(filepath,all_length,id){
df <- read.csv(filepath,sep="\t",stringsAsFactors = F)
for (i in 1:nrow(df)){
cdr3 <- df[i,"cdr3aa"]
all_length[as.character(nchar(cdr3)),id] <- all_length[as.character(nchar(cdr3)),id]+1
}
return(all_length)
}
all_lenth_distribution <- function(file_list,id_list,group_list){
all_length <- data.frame()
for (i in 1:length(file_list)){
df <- read.csv(file_list[i],sep="\t",stringsAsFactors = F)
for (k in 1:nrow(df)){
cdr3 <- df[k,"cdr3aa"]
all_length[as.character(nchar(cdr3)),id_list[i]] <- 0
}
}
all_length[is.na(all_length)] <- 0
for (i in 1:length(file_list)){
all_length <- lenth_distribution(file_list[i],all_length,id_list[i])
all_length["sample_type",id_list[i]]=group_list[i]
}
return(all_length)
}
name <- read.table("./metadata.txt",sep = "\t",header = 1)
work_tcr_type <- "TRA" #更改这里为所需要分析的链
file_list <- name$file_name[which(name$tcr_type==work_tcr_type)]
id_list <- name$sample_id[which(name$tcr_type==work_tcr_type)]
group_list <- c(rep("Control"4),rep("Treatment",4))
all_length <- all_lenth_distribution(file_list,id_list,group_list)
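For orientation, based on the functions above: all_length has one row per observed CDR3 amino-acid length plus a final sample_type row, and one column per sample, so it can be inspected like this:
head(all_length) # clonotype counts per CDR3 length for each sample
all_length["sample_type",] # group label of each sample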
#---------absolute----------------#
# plot absolute clonotype counts
if(F){
temp <- all_length[-nrow(all_length),]
temp <- temp[order(as.numeric(rownames(temp))),]
all_length <- rbind(temp,all_length[nrow(all_length),])}
#------------percent----------#
# plot percentages (fraction of clonotypes at each length)
if(T){
temp <- all_length[-nrow(all_length),]
temp=as.data.frame(lapply(temp,as.numeric),row.names = rownames(temp))
colSums(temp) # sanity check: total clonotypes per sample before scaling
temp <- data.frame(apply(temp,2,function(x) x/sum(x)))
temp <- temp[order(as.numeric(rownames(temp))),]
all_length <- rbind(temp,all_length[nrow(all_length),])}
####---check this and change--###
all_length <- all_length[c(6:14,nrow(all_length)),] # pick a suitable CDR3 length range for plotting (keep the last sample_type row)
#draw picture
df <- all_length
work_df <- data.frame(t(df))
df <- melt(work_df, id="sample_type", variable.name="length",value.name = "count")
df$length <- substring(df$length,2)
df$count <- as.numeric(df$count)
df$length <- as.numeric(df$length)
df <- df[order(df$length),]
df$length <- as.factor(df$length)
mean <- aggregate(df$count, by=list(df$length,df$sample_type), FUN=mean)
sd <- aggregate(df$count, by=list(df$length, df$sample_type), FUN=sd)
len <- aggregate(df$count, by=list(df$length, df$sample_type), FUN=length)
df_res <- data.frame(mean, sd=sd$x, len=len$x)
colnames(df_res) = c("CDR3_Length", "Group", "Mean", "Sd", "Count")
df_res$Se <- df_res$Sd/sqrt(df_res$Count)
ggplot(df_res, mapping = aes(x=CDR3_Length, y=Mean, fill=Group,group=Group,color=Group)) +
labs(x = "CDR3 Length",y = "Mean",title = paste0("Distribution of ",work_tcr_type," CDR3 Length"))+
geom_bar(stat="identity", position=position_dodge(),color="black", width=.8) +
geom_errorbar(aes(ymin=Mean-Se, ymax=Mean +Se),
position=position_dodge(.8), width=.2,color="black") +
theme_bw()+ # apply the complete theme first so the tweaks below are not overwritten
theme(axis.text.x = element_text(size = 14, color = "black"))+ # x-axis text size
theme(axis.text.y = element_text(size = 14, color = "black"))+ # y-axis text size
theme(title=element_text(size=13))+ # title text size
geom_line(size=0.5,position = position_dodge(0.8))+
scale_y_continuous(expand = c(0,0),limits = c(0,0.3)) # adjust the y-axis limits to fit your data
ggsave(paste0("./output/CDR3_lenth",work_tcr_type,".pdf"),width = 10,height = 8)
# stacked bar chart
library(ggsci)
ggplot(df_res,mapping = aes(x=Group,y=Mean,fill=CDR3_Length))+
geom_col(position = 'stack', width = 0.6)+
scale_fill_aaas()+ # fill is the mapped aesthetic, so use the fill scale
theme_bw()+scale_y_continuous(expand = c(0,0))
ggsave(paste0("./output/CDR3_lenth",work_tcr_type,"_stack.pdf"),width = 7,height = 7)
5. Gene usage analysis
java -jar e:/vdjtools/vdjtools-1.2.1.jar CalcSegmentUsage -m .\metadata.txt .\results\
rm(list=ls())
library(RColorBrewer)
library(pheatmap)
for (region in c("V","J")){
for (type in c("TRA","TRB","TRD","TRG")){
file_name <- paste0("./results/segments.wt.",region,".txt")
df <- read.csv(file_name,sep="\t",row.names = 1)
work_tcr_type <- type
work_df <- df[which(df$tcr_type==work_tcr_type),]
work_df <- work_df[,-1:-2]
work_df <- t(work_df)
work_df <- work_df[which(rowSums(work_df)>0),]
work_df <- work_df*100
#---------------------------------------------------------#
annotation_col <- data.frame(c(rep("Control",4),rep("Treatment",4)))
#---------------------------------------------------------#
rownames(annotation_col) <- colnames(work_df)
colnames(annotation_col) <- "Type"
#coul <- colorRampPalette(brewer.pal(9, "OrRd"))(50)
coul <- colorRampPalette(colors = c("blue","white","red"))(100)
pheatmap(work_df,
filename = paste0("./output/",work_tcr_type,"_GeneUsage_",region,".pdf"),
scale = "row",
annotation_col = annotation_col,cluster_cols = FALSE,
height = 15,width = 10,cutree_col =2,display_numbers = FALSE,labels_col = "",color = coul)
}
}
6. Amino acid frequency analysis
metadata <- read.table("./metadata.txt",sep="\t",header = 1)
chr_freq <- function(str,chr){
chars <- strsplit(str,"")[[1]]
# proportion of residue "chr" within one CDR3; returns NA when the residue is absent
# (those NAs are dropped later by mean(p,na.rm = T))
p <- table(chars)[chr]/nchar(str)
return(p)
}
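As a quick sanity check, chr_freq returns the within-sequence proportion of a single residue (the CDR3 here is made up for illustration):
chr_freq("CASSLAPGATNEKLFF","A") # 3 of the 16 residues are A, so this returns 0.1875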
aa_freq <- function(df,chr){
p <- c()
for (i in 1:nrow(df)){
cdr3 <- df[i,"cdr3aa"]
p[i] <- chr_freq(cdr3,chr)
}
p_mean <- mean(p,na.rm = T)
return(p_mean)
}
aa_name <- c("A","R","N","D","C","Q","E","G","H","I","L","K","M","F","P","S","T","W","Y","V")#要分析的氨基酸
file_list <- metadata$file_name
sample_id <- metadata$sample_id
p=data.frame()
for (i in 1:length(file_list)){
print(paste0(file_list[i],"---running!"))
df <- read.csv(file_list[i],sep="\t",stringsAsFactors = F)
for (aa in aa_name){
print(paste0(aa,"-------running!----------"))
p[sample_id[i],aa] <- aa_freq(df,aa)
}
}
p1 <- p*100
write.table(p1,file="./results/aa_frequency.txt",sep="\t",row.names = T,quote = F)
library(ggplot2)
library(reshape2)
library(ggpubr)
aa_fre <- read.csv("./results/aa_frequency.txt",sep="\t",row.names = 1)
aa_fre$Sample_type <- c(rep("Control",16),rep("Treatment",16))
aa_fre$tcr_type <- rep(c("TRA","TRB","TRD","TRG"),8) # 8 samples x 4 chains = 32 rows
aa_fre$Sample_type <- factor(aa_fre$Sample_type)
aa_fre$tcr_type <- factor(aa_fre$tcr_type)
for (tcr in c("TRA","TRB","TRD","TRG")){
aa_fre_work <- aa_fre[which(aa_fre$tcr_type==tcr),]
aa_fre_work <- aa_fre_work[,-ncol(aa_fre_work)]
work_res <- melt(aa_fre_work,id.vars = "Sample_type")
for (i in 1:(ncol(aa_fre_work)-1)){
temp <- work_res[which(work_res$variable==colnames(aa_fre)[i]),]
p1 <- ggpaired(temp, x = "Sample_type", y = "value",
line.color = "gray", line.size = 0.5,
legend = "right",color = "Sample_type", palette = "nejm",
add = "jitter",add.params=list(color = "Sample_type",size=3),
title = colnames(aa_fre)[i], xlab = "",size = 0.5,width =0.5,
ylab = paste0("Percentage of ",tcr," aa frequency")) +
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))+
theme(legend.position = 'none')
p2 <- p1 + stat_compare_means(aes(group = Sample_type),
label.x = 1.5,label = "p.signif",
# comparisons = list(c("Control","Treatment")),
method = 'wilcox.test',hide.ns = TRUE,size=8)
# inside a loop the plot is never auto-printed, so pass it to ggsave explicitly
ggsave(paste0("./output/aa_fre/",tcr,"_",colnames(aa_fre_work)[i],".png"),plot = p2)
}
}