使用《OncoImmunology》一篇paper中的数据进行实战
- 数据获取地址:https://www.ncbi.nlm.nih.gov/bioproject/PRJEB33490
关于免疫组库的背景,MIXCR和VDJTOOLS的使用可以参考之前的文章
使用mixcr构建免疫组库及下游分析
使用vdjtools进行免疫组库分析
直接进入主题
1.下载数据
1.SRR_Acc_List.txt (所有测序数据的样本号,直接下载得到)
2.SraRunTable.txt (样本的分组信息,后面数据分析会用到)
#linux
prefetch -O ./rawdata --option-file ./SRR_Acc_List.txt #批量下载数据
cat SRR_Acc_List.txt | while read id; do fastq-dump --split-files --gzip -O ./rawdata/ ./rawdata/${id}/${id}.sra;done # sra to fastq
fastqc -o ./qc ./rawdata/*gz #质控
详细方法参考:RNA-seq转录组差异分析及可视化
2.使用MIXCR构建免疫组库
#只分析TRB
cat SRR_Acc_List.txt|while read name
do
mixcr analyze amplicon \
-s hsa \
--starting-material dna \
--5-end v-primers \
--3-end c-primers \
--adapters adapters-present \
--only-productive \
--receptor-type trb \
--report ./reports/${name}.report \
./rawdata/${name}_1.fastq.gz ./rawdata/${name}_2.fastq.gz ./results/${name}
done
详细方法参考:使用mixcr构建免疫组库及下游分析
3.VDJTOOLS分析
这一步需要用到之前下载的SraRunTable.txt,处理得到想要的分组文件。
前两列必须为“file_name”,"sample_id",后面列可根据需求进行分组
第一列是MIXCR恢复出来数据的路径,第二列为样本名称
vdjtools=~/project/biosoft/vdjtools/vdjtools-1.2.1/vdjtools-1.2.1.jar
mkdir ./results/TRB/VDJconvert/
mkdir ./results/vdjtools-results/
java -jar $vdjtools Convert -S mixcr -m ./results/TRB/metadata.txt ./results/TRB/VDJconvert/
java -jar $vdjtools CalcBasicStats -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools CalcSegmentUsage -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools RarefactionPlot -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
java -jar $vdjtools CalcDiversityStats -m ./results/TRB/VDJconvert/metadata.txt ./results/vdjtools-results/
转换格式后在目标文件夹会生成一个新的metadata.txt
,后续分析使用此分组文件。
共得到6个结果文件分别为basicstats.txt
、diversity.strict.exact.txt
、diversity.strict.resampled.txt
、rarefaction.strict.txt
、segments.wt.J.txt
、segments.wt.V.txt
。
4.使用R包进行可视化处理
4.1 clonality diversity richness
inputpath <- "./results/vdjtools-results/"
outputpath <- "./R-results/"
library(ggplot2)
library(reshape2)
library(ggpubr)
df <- read.csv(paste0(inputpath,"diversity.strict.exact.txt"), sep = "\t",stringsAsFactors = FALSE)
df$real_diversity <- log(df$shannonWienerIndex_mean)
df$clonality <- 1 - df$normalizedShannonWienerIndex_mean
work_df <- df
sample_name <- work_df$sample_id
Group <- work_df$sample_type
richness<- work_df$observedDiversity_mean
diversity <- work_df$real_diversity
clonality <- work_df$clonality
work_df <- data.frame(Group,richness,diversity,clonality)
work_df$Group <- factor(work_df$Group)
work_res<- melt(work_df, id.vars = "Group")
work_richness <- work_res[which(work_res$variable=="richness"),]
work_diversity <- work_res[which(work_res$variable=="diversity"),]
work_clonality <- work_res[which(work_res$variable=="clonality"),]
p1 <- ggboxplot(work_diversity, x = "Group", y = "value",
legend = "right",color = "black", palette = "jama",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Diversity", xlab = "",size = 0.5,width =0.5,
ylab = "Shannon Diversity Index of \n TCR repertoire") +
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p1 + stat_compare_means(aes(group = work_df$Group),
label.x = 2,label = "p.signif",
method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Diversity.png"))
dev.off()
p2 <- ggboxplot(work_richness, x = "Group", y = "value",
legend = "right",color = "black", palette = "jama",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Richness", xlab = "",size = 0.5,width =0.5,
ylab = "Richness of \n TCR repertoire")+
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p2 + stat_compare_means(aes(group =work_df$Group),
label.x = 2,label = "p.signif",
method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Richness.png"))
dev.off()
p3 <- ggboxplot(work_clonality, x = "Group", y = "value",
legend = "right",color = "black", palette = "jama",
add = "jitter",add.params=list(color = "Group",size=3),
title = "Clonality", xlab = "",size = 0.5,width =0.5,
ylab = "Clonality Index of \n TCR repertoire")+
theme(axis.text.x =element_text(size=13), axis.title.y=element_text(size=16,face="bold"))
p3 + stat_compare_means(aes(group =work_df$Group),
label = "p.signif",label.x = 2,
method = 'wilcox.test',hide.ns = TRUE)
ggsave(paste0(outputpath,"Clonality.png"))
dev.off()
4.2Gene Usage热图绘制
inputpath <- "./results/vdjtools-results/"
outputpath <- "./R-results/"
library(pheatmap)
file_input <- c(paste0(inputpath,"segments.wt.V.txt"),
paste0(inputpath,"segments.wt.J.txt"))
file_output <- c(paste0(outputpath,"V_heatmap.png"),paste0(outputpath,"J_heatmap.png"))
for (i in 1:length(file_input)){
df <- read.csv(file_input[i],sep="\t",row.names = 1)
annotation_col <- data.frame(df$sample_type)
colnames(annotation_col) <- "Type"
rownames(annotation_col) <- rownames(df)
work_df <- df
work_df <- work_df[,-1:-2] # check this and change
work_df <- t(work_df)
work_df <- work_df[rowSums(work_df)!=0,]
pheatmap(work_df,filename = file_output[i],
annotation_col = annotation_col,
cluster_cols = FALSE, #height = 10,
display_numbers = FALSE,labels_col ="" )# check this and change
}
4.3 VJ基因的差异分析及火山图绘制
mymetadata <- read.csv("./results/TRB/VDJconvert/metadata.txt",sep="\t",stringsAsFactors = F)
sample_name <- mymetadata$sample_id
vgene_matrix <- function(df){
vgene_name <- c()
for (i in 1:nrow(df)){
if (df[i,"v"] %in% names(vgene_name)){
vgene_name[df[i,"v"]] <- vgene_name[df[i,"v"]] + df[i,"count"]
} else{
vgene_name[df[i,"v"]] <- df[i,"count"]
}
}
return(vgene_name)
}
jgene_matrix <- function(df){
jgene_name <- c()
for (i in 1:nrow(df)){
if (df[i,"j"] %in% names(jgene_name)){
jgene_name[df[i,"j"]] <- jgene_name[df[i,"j"]] + df[i,"count"]
} else{
jgene_name[df[i,"j"]] <- df[i,"count"]
}
}
return(jgene_name)
}
VGeneMatrix <- data.frame()
JGeneMatrix <- data.frame()
matrix_V <- function(df,V,sample_name){
for (i in 1:nrow(df)){
if (df[i,"v"] %in% colnames(V)){
V[is.na(V)] <- 0
V[sample_name[n],df[i,"v"]] <- V[sample_name[n],df[i,"v"]] + df[i,"count"]
} else{
V[sample_name[n],df[i,"v"]] <- df[i,"count"]
}
}
return(V)
}
matrix_J <- function(df,J,sample_name){
for (i in 1:nrow(df)){
if (df[i,"j"] %in% colnames(J)){
J[is.na(J)] <- 0
J[sample_name[n],df[i,"j"]] <- J[sample_name[n],df[i,"j"]] + df[i,"count"]
} else{
J[sample_name[n],df[i,"j"]] <- df[i,"count"]
}
}
return(J)
}
for (n in 1:length(sample_name)){
df <- read.csv(paste0("./results/TRB/VDJconvert/",sample_name[n],".txt"),sep="\t",stringsAsFactors = F)
# assign(paste0(sample_name[n],"_Vgene"),vgene_matrix(df))
# assign(paste0(sample_name[n],"_Jgene"),jgene_matrix(df)) 每个样本的表达情况
VGeneMatrix <- matrix_V(df,VGeneMatrix,sample_name)
JGeneMatrix <- matrix_J(df,JGeneMatrix,sample_name)
}
write.table(VGeneMatrix,file = paste0(outputpath,"matrix_V.txt"),sep="\t",quote = F)
write.table(JGeneMatrix,file = paste0(outputpath,"matrix_J.txt"),sep="\t",quote = F)
#########DESeq2#######
library(DESeq2)
library(ggpubr)
library(ggthemes)
mycounts <- t(JGeneMatrix)
mycounts <- mycounts[rowSums(mycounts)!=0,]
mymetas <- mymetadata[,2:3]
colnames(mycounts) == mymetas$sample_id
mymetas$sample_type <- factor(mymetas$sample_type)
dds <- DESeqDataSetFromMatrix(countData = mycounts, colData = mymetas,desig=~sample_type)
dds <- DESeq(dds)
res <- results(dds,contrast = c("sample_type","healthy","hematological_cancer"))
res <- data.frame(res)
DEG <- res[order(res$padj),]
DEG$change =as.factor(ifelse(DEG$pvalue < 0.05 & DEG$log2FoldChange > 0.5,'UP',
ifelse(DEG$pvalue < 0.05 & DEG$log2FoldChange < -0.5,'DOWN',
"NOT")))
DEG$logp = -log10(DEG$pvalue)
table(DEG$change)
DEG <- DEG[order(DEG$pvalue),]
up_genes <- head(row.names(DEG)[which(DEG$change == "UP")],10)
down_genes <- head(row.names(DEG)[which(DEG$change =="DOWN")],10)
DEG_top <- c(as.character(up_genes),as.character(down_genes))
DEG$label <- ""
DEG$label[match(DEG_top,row.names(DEG))] <- DEG_top
ggscatter(DEG, x = "log2FoldChange", y = "logp",
color = "change",
palette = c("#2f5688","#BBBBBB","#CC0000"),
size = 6,
label = DEG$label,
font.label = 10,
repel = T,
xlab = "log2FoldChange",
ylab ="-log10(p value)") + theme_base() +
geom_hline(yintercept = -log10(0.05),linetype="dashed")+
geom_vline(xintercept = c(-0.5,0.5),linetype="dashed")
ggsave("./results/J_gene_valcano.png",dpi=600)
dev.off()