R包——Mutational Pattern画突变频谱图
效果图:
原文:https://www.jianshu.com/p/1347a0ee74a0
rm(list=ls())
source("http://bioconductor.org/biocLite.R")
biocLite("pheatmap")
library(pkgmaker)
library(registry)
library(rngtools)
library(cluster)
library(NMF)
library(maftools)
library(pheatmap)
setwd("./")
#input files
#frequently_mutected_genes=read.table("D:\\LQ\\items\\project\\tmp_xuhuawei\\frequently_mutated_genes.txt",header = T)
var.annovar.maf = annovarToMaf(annovar = "D:\\LQ\\items\\project\\tmp_xuhuawei\\all_annovar3", Center = 'NA', refBuild = 'hg19', tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene',sep = "\t")
write.table(x=var.annovar.maf,file="var_annovar_maf",quote= F,sep="\t",row.names=F)
var_maf = read.maf(maf ="var_annovar_maf")
#find mutation genes in samples
#oncoplot(maf = var_maf, top =100, writeMatrix=T,removeNonMutated = F,showTumorSampleBarcodes=T)
#selected concordant genes and get maf information
#sample_genes=read.table("onco_matrix.txt",header = F,sep = "\t",skip = 1)
#sample_mutected_genes=subset(sample_genes,sample_genes$V1 %in% frequently_mutected_genes$Symbol)
#sample_mutected_maf=subset(var.annovar.maf,var.annovar.maf$Hugo_Symbol %in% sample_mutected_genes$V1)
#write.table(x=sample_mutected_maf,file="sample_mutected_maf",quote= F,sep="\t",row.names=F)
#sample.mutected.maf=read.maf(maf="sample_mutected_maf")
laml.tnm = trinucleotideMatrix(maf = var_maf, ref_genome = "ucsc.hg19.fasta", useSyn = TRUE,add = TRUE,prefix="chr")
laml.sign = extractSignatures(mat = laml.tnm, nTry = 6, plotBestFitRes = F)
plotSignatures(laml.sign)
pheatmap::pheatmap(mat = laml.sign$coSineSimMat, cluster_rows = FALSE, main = "cosine similarity against validated signatures")
数据提取和筛选1
这是做毕设时刚开始接触R,大部分都是我的老师写的,我也是看了这几个R代码才慢慢入了门的,去看别人的程序来学习是一种快捷方式,后来很多次我都还会反复来看看
rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)
#质谱检测肽段长度 8到25个氨基酸,比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25
# input files
in_phospho_file = "Phosphorylation_site_dataset" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data
# output files
out_phospho_antibody_human_p_file = "4.phospho_antibody_human_p.txt" # containing Human regulatoryhuman phospho sites
out_fasta_reg_file = "4.df_fasta_phospho_antibody.txt" # human protein sequences including human regulatoryhuman phospho sites
out_MS_data_reg_frequency_file = "4.MS_data_phospho_antibody_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites
out_phospho_antibody_MS_comb_file = "4.phospho_antibody_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_antibody_MS_comb_MS_file = "4.phospho_antibody_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)
# Read regulatory_site file (读取调控数据)
setwd("D:/LQ/phosphosite_2017_12_20/")
phospho_data = read.table(in_phospho_file, header=T, sep="\t", skip=3, comment.char="", stringsAsFactors=F) #读入磷酸化位点数据
phospho_antibody_human_p = subset(phospho_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD) & nchar(CST_CAT.)>2) # 找出人类的磷酸化修饰的位点
phospho_antibody_human_p$pos = as.numeric(substr(phospho_antibody_human_p$MOD_RSD, 2, nchar(phospho_antibody_human_p$MOD_RSD)-2)) # 修饰位点数字化定位
write.table(phospho_antibody_human_p, out_phospho_antibody_human_p_file, quote=FALSE, sep="\t", row.names=FALSE) #读出phospho_antibody_human_p
#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file) #读入蛋白质序列
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows
# retain protein sequences that contain antibody-targeted phosphosites
df_fasta_antibody = subset(df_fasta, PROTEIN_ID %in% phospho_antibody_human_p$ACC_ID) #根据蛋白质ID,找到磷酸化修饰位点所在蛋白质序列
#add 7x"_" before and after protein sequence for matching
df_fasta_antibody$SEQ = paste("_______", toupper(df_fasta_antibody$sequence), "_______",sep="") #将序列改为大写,并加上7个"-",便于匹配
write.table(df_fasta_antibody, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE) #读出带有抗体的蛋白质序列df_fasta_antibody
#debug code: check if all antibody-targeted sites are included in the protein sequences
phospho_antibody_human_p_tmp = subset(phospho_antibody_human_p, ACC_ID %in% df_fasta_antibody$PROTEIN_ID )
if(nrow(phospho_antibody_human_p_tmp) != nrow(phospho_antibody_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}
# read peptide sequences detected using MS machine
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F) #读入质谱数据
# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_anti = subset(MS_data, PROTEIN_ID %in% df_fasta_antibody$PROTEIN_ID) #根据蛋白质ID,筛选带有抗体的肽段
df_fasta_antibody = subset(df_fasta_antibody, PROTEIN_ID %in% MS_data$PROTEIN_ID) #从带有抗体的蛋白质序列中选出能与肽段匹配的蛋白质序列
# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_anti_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F) #根据肽段出现频率,读出肽段丰度
MS_data_anti_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE) #读出肽段丰度
#找出蛋白质长度、带有抗体的磷酸化修饰的肽段序列,序列最大长度,及最大丰度
phospho_antibody_human_p$prot_length = NA # store protein length
phospho_antibody_human_p$MS_seq = NA # store the MS peptides that contain the antibody-targeted phosphosites
phospho_antibody_human_p$MS_seq_max = NA # store the MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times
if( nrow(phospho_antibody_human_p) ==0 ){exit("Error! Line 71. There is no content in the df phospho_antibody_human_p!")}
#填入数据
for(i in 1:nrow(phospho_antibody_human_p)){
phospho = phospho_antibody_human_p[i,]
df_fasta = subset(df_fasta_antibody, PROTEIN_ID %in% phospho$ACC_ID)
MS_data_tmp = subset(MS_data_anti, PROTEIN_ID %in% phospho$ACC_ID)
if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10
MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
# Set the locations of MS peptides in protein sequence
seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE) # change matrix format to df
colnames(df_location_range) = c("pep_start", "pep_end")
MS_data_freq = cbind(MS_data_freq, df_location_range)
# Judge if the phosphosite is located within the MS peptides
MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
# order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]
phospho_antibody_human_p$prot_length[i] = nchar(df_fasta$sequence)
phospho_antibody_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
phospho_antibody_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
phospho_antibody_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
phospho_antibody_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
}
}
write.table(phospho_antibody_human_p, out_phospho_antibody_MS_comb_file, quote=FALSE, sep="\t", row.names=FALSE, na="")
# only select the regulatory phosphosites that are located in the MS peptides in the range of 8 to 25.
phospho_antibody_human_p_MS = subset(phospho_antibody_human_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(phospho_antibody_human_p_MS, out_phospho_antibody_MS_comb_MS_file, quote=FALSE, sep="\t", row.names=FALSE, na="")
数据提取和筛选2
rm(list=ls())
library(hash)
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)
MS_seq_length_min = 6
MS_seq_length_max = 20
# Read regulatory_site and disease_site files
setwd("G:/projects/phosphosite_2017_12_20/")
regulatory_file = "Regulatory_sites.xls"
regulatory_data = read.xlsx(regulatory_file, sheetIndex=1, header=TRUE, stringsAsFactors=F)
regulatory_human_p = subset(regulatory_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD))
regulatory_human_p$upper_peptide = toupper(regulatory_human_p$SITE_...7_AA)
#regulatory_human_p_kinase = subset(regulatory_human_p, grepl("kinase", PROT_TYPE, ignore.case=T))
#regulatory_human_p_enzyme = subset(regulatory_human_p, grepl("EC", PROT_TYPE))
disease_data = read.xlsx(regulatory_file, sheetIndex=2, header=TRUE, stringsAsFactors=F)
disease_data$upper_peptide = toupper(disease_data$SITE_...7_AA)
disease_human_p = subset(disease_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD))
disease_human_p_changed = subset(disease_data, ORGANISM %in% "human" & grepl("p", MOD_RSD) & ALTERATION %in% c("increased", "decreased"))
columnames = c("GENE", "PROTEIN", "ACC_ID", "GENE_ID", "ORGANISM", "MOD_RSD", "SITE_...7_AA","LT_LIT", "MS_LIT", "MS_CST", "upper_peptide")
regulatory_human_p = regulatory_human_p[,columnames]
disease_human_p_changed = disease_human_p_changed[,columnames]
dis_reg_hu_p = rbind(regulatory_human_p, disease_human_p_changed)
dis_reg_hu_p = unique(dis_reg_hu_p)
dis_reg_hu_p$pos = as.numeric(substr(dis_reg_hu_p$MOD_RSD, 2, nchar(dis_reg_hu_p$MOD_RSD)-2))
write.table(dis_reg_hu_p, "1.dis_reg_hu_p.txt", quote=FALSE, sep="\t", row.names=FALSE)
# fasta file #
fastaFile <- readAAStringSet("Phosphosite_PTM_seq.fasta")
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor df to character df
df_fasta = unique(df_fasta)
df_fasta_dis_reg = subset(df_fasta, PROTEIN_ID %in% dis_reg_hu_p$ACC_ID)
df_fasta_dis_reg$SEQ = paste("_______", toupper(df_fasta_dis_reg$sequence), "_______",sep="")
write.table(df_fasta_dis_reg, "1.df_fasta_dis_reg.txt", quote=FALSE, sep="\t", row.names=FALSE)
# track_back check
dis_reg_hu_p_tmp = subset(dis_reg_hu_p, ACC_ID %in% df_fasta_dis_reg$PROTEIN_ID )
if(nrow(dis_reg_hu_p_tmp) != nrow(dis_reg_hu_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}
# read MS peptides #############
MS_data = read.table("huangyu_proteomicsDB_2.txt", header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)
#keep both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_reg_dis = subset(MS_data, PROTEIN_ID %in% df_fasta_dis_reg$PROTEIN_ID)
df_fasta_dis_reg = subset(df_fasta_dis_reg, PROTEIN_ID %in% MS_data$PROTEIN_ID)
MS_data_reg_dis_frequency=as.data.frame(table(MS_data_reg_dis$SEQUENCE), stringsAsFactors=F)
MS_data_reg_dis_frequency = subset(MS_data_reg_dis_frequency, Freq>0)
write.table(MS_data_reg_dis_frequency, "1.MS_data_reg_dis_frequency.txt", quote=FALSE, sep="\t", row.names=FALSE)
dis_reg_hu_p$prot_length = NA
dis_reg_hu_p$MS_seq = NA
dis_reg_hu_p$MS_seq_max = NA
dis_reg_hu_p$MS_seq_max_length = NA
dis_reg_hu_p$MS_seq_max_freq = NA
for(i in 1:nrow(dis_reg_hu_p)){
phospho = dis_reg_hu_p[i,]
df_fasta = subset(df_fasta_dis_reg, PROTEIN_ID %in% phospho$ACC_ID)
MS_data_tmp = subset(MS_data_reg_dis, PROTEIN_ID %in% phospho$ACC_ID)
if(length(MS_data_tmp$SEQUENCE)<10){next} # next if protein seq length <10
MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
# Set the locations of MS peptides in protein sequence
seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE)
colnames(df_location_range) = c("pep_start", "pep_end")
MS_data_freq = cbind(MS_data_freq, df_location_range)
# Judge if the phosphosite is located within the MS peptides
MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
# order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]
dis_reg_hu_p$prot_length[i] = nchar(df_fasta$sequence)
dis_reg_hu_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
dis_reg_hu_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
dis_reg_hu_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
dis_reg_hu_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
}
}
write.table(dis_reg_hu_p, "1.phospho_reg_dis_MS_comb.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")
dis_reg_hu_p_select = subset(dis_reg_hu_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)
#质谱检测肽段长度 8到25个氨基酸,比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25
# input files
in_regulatory_file = "Regulatory_sites.xls" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data
# output files
out_regulatory_human_p_file = "2.regulatory_human_p.txt" # containing Human regulatoryhuman phospho sites
out_fasta_reg_file = "2.df_fasta_reg.txt" # human protein sequences including human regulatoryhuman phospho sites
out_MS_data_reg_frequency_file = "2.MS_data_reg_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites
out_phospho_reg_MS_comb_file = "2.phospho_reg_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_reg_MS_comb_MS_file = "2.phospho_reg_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)
# Read regulatory_site file (读取调控数据)
setwd("G:/projects/phosphosite_2017_12_20/")
regulatory_data = read.xlsx(in_regulatory_file, sheetIndex=1, header=TRUE, stringsAsFactors=F)
regulatory_human_p = subset(regulatory_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD)) # human and phospho modificatoin only
regulatory_human_p$pos = as.numeric(substr(regulatory_human_p$MOD_RSD, 2, nchar(regulatory_human_p$MOD_RSD)-2)) # retrive modification positions
write.table(regulatory_human_p, out_regulatory_human_p_file, quote=FALSE, sep="\t", row.names=FALSE)
#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file)
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows
# retain protein sequences that contain regulatory phosphosites
df_fasta_reg = subset(df_fasta, PROTEIN_ID %in% regulatory_human_p$ACC_ID)
#add 7x"_" before and after protein sequence for matching
df_fasta_reg$SEQ = paste("_______", toupper(df_fasta_reg$sequence), "_______",sep="")
write.table(df_fasta_reg, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE)
#debug code: check if all regulatory sites are included in the protein sequences
regulatory_human_p_tmp = subset(regulatory_human_p, ACC_ID %in% df_fasta_reg$PROTEIN_ID )
if(nrow(regulatory_human_p_tmp) != nrow(regulatory_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}
# read peptide sequences detected using MS machine #############
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)
# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_reg = subset(MS_data, PROTEIN_ID %in% df_fasta_reg$PROTEIN_ID)
df_fasta_reg = subset(df_fasta_reg, PROTEIN_ID %in% MS_data$PROTEIN_ID)
# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_reg_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F)
MS_data_reg_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE)
regulatory_human_p$prot_length = NA # store protein length
regulatory_human_p$MS_seq = NA # store the MS peptides that contain the regulatoary phosphosites
regulatory_human_p$MS_seq_max = NA # store the MS peptide detected with the most times
regulatory_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
regulatory_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times
for(i in 1:nrow(regulatory_human_p)){
phospho = regulatory_human_p[i,]
df_fasta = subset(df_fasta_reg, PROTEIN_ID %in% phospho$ACC_ID)
MS_data_tmp = subset(MS_data_reg, PROTEIN_ID %in% phospho$ACC_ID)
if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10
MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
# Set the locations of MS peptides in protein sequence
seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE)
colnames(df_location_range) = c("pep_start", "pep_end")
MS_data_freq = cbind(MS_data_freq, df_location_range)
# Judge if the phosphosite is located within the MS peptides
MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
# order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]
regulatory_human_p$prot_length[i] = nchar(df_fasta$sequence)
regulatory_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
regulatory_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
regulatory_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
regulatory_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
}
}
write.table(regulatory_human_p, out_phospho_reg_MS_comb_file, quote=FALSE, sep="\t", row.names=FALSE, na="")
# only select the regulatory phosphosites that are located in the MS peptides in the range of 8 to 25.
regulatory_human_p_MS = subset(regulatory_human_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(regulatory_human_p_MS, out_phospho_reg_MS_comb_MS_file, quote=FALSE, sep="\t", row.names=FALSE, na="")
rm(list=ls())
library(hash)
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)
MS_seq_length_min = 8
MS_seq_length_max = 25
# Read disease_site file
setwd("D:/LQ/phosphosite_2017_12_20/")
regulatory_file = "Regulatory_sites.xls"
disease_data = read.xlsx(regulatory_file, sheetIndex=2, header=TRUE, stringsAsFactors=F) #读入disease数据
disease_data$upper_peptide = toupper(disease_data$SITE_...7_AA) #将SITE全部变为大写
disease_human_p = subset(disease_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD)) #从disesase中筛选出human和磷酸化的
# only select phosphosites whose expression are changed in diseases
disease_human_p_changed = subset(disease_data, ORGANISM %in% "human" & grepl("p", MOD_RSD) & ALTERATION %in% c("increased", "decreased"))#选出磷酸化表达会改变的
disease_human_p_changed = unique(disease_human_p_changed) #合并相同行
disease_human_p_changed$pos = as.numeric(substr(disease_human_p_changed$MOD_RSD, 2, nchar(disease_human_p_changed$MOD_RSD)-2)) #定位蛋白质位点数字化
write.table(disease_human_p_changed, "3.disease_human_p_changed.txt", quote=FALSE, sep="\t", row.names=FALSE) #读出文件
# fasta file #
fastaFile <- readAAStringSet("Phosphosite_PTM_seq.fasta")
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor df to character df
df_fasta = unique(df_fasta)
df_fasta_dis = subset(df_fasta, PROTEIN_ID %in% disease_human_p_changed$ACC_ID) #根据蛋白质ID,从fasta里选出与disease相同的数据
df_fasta_dis$SEQ = paste("_______", toupper(df_fasta_dis$sequence), "_______",sep="") #添加SEQ列,将sequence变为大写并在前面加7“-”
write.table(df_fasta_dis, "3.df_fasta_dis.txt", quote=FALSE, sep="\t", row.names=FALSE) #读出文件
# track_back check
disease_human_p_changed_tmp = subset(disease_human_p_changed, ACC_ID %in% df_fasta_dis$PROTEIN_ID )
if(nrow(disease_human_p_changed_tmp) != nrow(disease_human_p_changed)){stop("Error! the number of rows are different from Phosphosite ! track back!")}
#read MS peptides
MS_data = read.table("huangyu_proteomicsDB_2.txt", header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)
#keep both MS_data and phospho_dis have the same swisprot ID
MS_data_dis = subset(MS_data, PROTEIN_ID %in% df_fasta_dis$PROTEIN_ID) #根据蛋白质id,从MS数据中筛选出与df_fasta_dis相同的数据
df_fasta_dis = subset(df_fasta_dis, PROTEIN_ID %in% MS_data$PROTEIN_ID) #根据蛋白质ID,从df_fasta_dis中筛选出MS数据中筛选出相同的数据
MS_data_dis_frequency=as.data.frame(table(MS_data_dis$SEQUENCE), stringsAsFactors=F) #根据序列得到肽段丰度
MS_data_dis_frequency = subset(MS_data_dis_frequency, Freq>0)
write.table(MS_data_dis_frequency, "3.MS_data_dis_frequency.txt", quote=FALSE, sep="\t", row.names=FALSE) #读出数据MS_data_dis_frequency
disease_human_p_changed$prot_length = NA
disease_human_p_changed$MS_seq = NA
disease_human_p_changed$MS_seq_max = NA
disease_human_p_changed$MS_seq_max_length = NA
disease_human_p_changed$MS_seq_max_freq = NA
for(i in 1:nrow(disease_human_p_changed)){
phospho = disease_human_p_changed[i,]
df_fasta = subset(df_fasta_dis, PROTEIN_ID %in% phospho$ACC_ID)
MS_data_tmp = subset(MS_data_dis, PROTEIN_ID %in% phospho$ACC_ID)
if(length(MS_data_tmp$SEQUENCE)<10){next} # next if protein seq length <10
MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence# if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
if( disease_human_p_changed$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
# Set the locations of MS peptides in protein sequence
# return start and end position for each MS peptide
seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) #将da_fasta中seq变为大写,然后与质谱中的肽段匹配
df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE) #等到定位后,另取两列,命名为“pep_start","pep_end"
colnames(df_location_range) = c("pep_start", "pep_end")
MS_data_freq = cbind(MS_data_freq, df_location_range) #将MS_data_freq与df_location_range列合并
# Judge if the phosphosite is located within the MS peptides
MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
# order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ] #选取丰度最高的肽段
disease_human_p_changed$prot_length[i] = nchar(df_fasta$sequence)
disease_human_p_changed$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
disease_human_p_changed$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
disease_human_p_changed$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
disease_human_p_changed$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
}
}
write.table(disease_human_p_changed, "3.phospho_dis_MS_comb.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")
disease_human_p_changed_select = subset(disease_human_p_changed, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(disease_human_p_changed_select, "3.phospho_dis_MS_comb_seq_select.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")
rm(list=ls())
library(plyr)
#设置工作目录
setwd("D:/LQ/phosphosite_2017_12_20")
#输入文件
input_fasta_reg="2.df_fasta_reg.txt"
input_egfr_protein_names="egfr_protein_names.txt"
#输出文件
outfile_not_in_fasta_protein="not_in_fasta_protein.txt"
outfile_fasta_egfr_protein_names_id="fasta_egfr_protein_names_id.txt"
#读入文件fasta_reg和egfr pathway proteins
fasta_reg= read.table(input_fasta_reg, header=T,sep="\t",stringsAsFactors=F, na.strings = "")
egfr_protein_names= read.table(input_egfr_protein_names, header=T, sep="\t")
#拆开蛋白质名字中的"GN:"
vs.test= unlist(strsplit(fasta_reg$seq_name_1, split=":"))
m.test = matrix(vs.test, ncol=2, byrow=T)
#给m.test列命名
colnames(m.test)[1:2]<-c("GN","gene_names")
df.convert = as.data.frame(m.test, stringsAsFactors=F)
#将基因和蛋白质id对应
fasta_gene_protein_id<-data.frame(m.test,fasta_reg$PROTEIN_ID)
#删除第一列“GN”
fasta_gene_protein_id = fasta_gene_protein_id[,-1]
#将egfr中蛋白质名称变为大写
egfr_protein_names<-toupper(egfr_protein_names$protein_names)
#将fasta_protein中的蛋白质与egfr_protein_names比对
fasta_egfr_protein = subset(egfr_protein_names, egfr_protein_names %in% fasta_gene_protein_id$gene_names)
fasta_egfr_protein=data.frame(fasta_egfr_protein)
#找出不在fasta文件中的蛋白质
not_in_fasta_protein = subset(egfr_protein_names, ! egfr_protein_names %in% fasta_gene_protein_id$gene_names)
not_in_fasta_protein=data.frame(not_in_fasta_protein)
write.table(not_in_fasta_protein,outfile_not_in_fasta_protein,quote=FALSE, sep="\t", row.names=FALSE, na="")
#找出fasta和egfr共有的蛋白质以及蛋白质id
fasta_egfr_protein_names_id=subset(fasta_gene_protein_id,fasta_gene_protein_id$gene_names%in%fasta_egfr_protein$fasta_egfr_protein)
write.table(fasta_egfr_protein_names_id,outfile_fasta_egfr_protein_names_id,quote=FALSE, sep="\t", row.names=FALSE, na="")
rm(list=ls())
#加载需要的包
library(Biostrings)
library(ShortRead)
setwd("D:/LQ/免疫/OEdata")
#读入PEAR DATA_1,将fastq文件转换为fasta文件
raw_pear<-readFastq("D:/LQ/免疫/raw_pear.fastq")
writeFasta(raw_pear,"raw_pear.fa")
#读入Pear拼接数据的fasta文件
raw_pear_data<-readDNAStringSet("raw_pear.fa")
raw_pear_seq_name=names(raw_pear_data)
raw_pear_seq<-paste(raw_pear_data)
df_raw_pear_data<-data.frame(raw_pear_seq_name,raw_pear_seq,stringsAsFactors=F)
#读入OD DATA
oe_fasta_1<-readDNAStringSet("ATCACG.fa")
oe_seq_name = names(oe_fasta_1)
OE_seq = paste(oe_fasta_1)
df_oe_fasta_1<- data.frame(oe_seq_name, OE_seq,stringsAsFactors=F)
#合并数据
total=merge(df_raw_pear_data,df_oe_fasta_1,by.x="raw_pear_seq_name", by.y="oe_seq_name", all=T)
total_woNA= na.omit(total)
#找出oe和raw的相同与不同序列
total_woNA$seq_identity=(total_woNA$raw_pear_seq == total_woNA$OE_seq)
total_woNA_raw_pear_in_oe = subset(total_woNA, seq_identity)
total_woNA_raw_pear_not_in_oe= subset(total_woNA, ! seq_identity)
a=total_woNA[,2:3]
f <- function(a) {
grepl(a[2],a[1], fixed=T)
}
total_woNA$seq_included=apply(total_woNA[,2:3], 1, f)
total_woNA_seq_included=subset(total_woNA,seq_included)
total_woNA_seq_not_include=subset(total_woNA,!seq_included)
#消除相同行
total_woNA_seq_included=unique(total_woNA_seq_included)
total_woNA_seq_not_include=unique(df_total_woNA_seq_not_include)
#找出oe不包含raw的序列长度
oe_length_not_included=nchar(total_woNA_seq_not_include$OE_seq)
raw_length_not_included=nchar(total_woNA_seq_not_include$raw_pear_seq)
df_length_not_include=data.frame(oe_length_not_included,raw_length_not_included)
#输出文件
write.table(total_woNA,"D:/LQ/免疫/raw_oe_total.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_raw_pear_in_oe,"D:/LQ/免疫/raw_pear_in_oe.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_raw_pear_not_in_oe,"D:/LQ/免疫/total_woNA_raw_pear_not_in_oe.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_seq_included,"D:/LQ/免疫/oe_seq_included_raw_pear.txt",quote = F,sep="\t",row.names=F)
write.table(total_woNA_seq_not_include,"D:/LQ/免疫/oe_seq_not_included_raw_pear.txt",quote=F,sep="\t",row.names = F)
rm(list=ls())
library(Biostrings)
setwd("D:/LQ/免疫/OEdata")
#读入文件,fasta文件处理
od_data_1<-readDNAStringSet("ATCACG.fa")
seq_name = names(od_data_1)
sequence = paste(od_data_1)
df_od_data_1 <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_od_data_1=unique(df_od_data_1)#消除重复行
#求序列长度
df_od_data_1$length=NA
sequence<-as.vector(df_od_data_1$sequence)
df_od_data_1$length=nchar(sequence)
df_od_data_1=df_od_data_1[order(df_od_data_1$length,decreasing = F),]#按序列长度大小排序,从小到大
write.table(df_od_data_1,"od_data_len.xls",quote = F,sep="\t",row.names = F)
#统计长度出现的频率
freq=table(df_od_data_1$length)
df_oe_data_len_freq=data.frame(freq)
write.table(df_oe_data_len_freq,"oe_data_len_freq.txt",quote = F,sep="\t",row.names = F)
rm(list=ls())
library(Biostrings)
setwd("D:/LQ/免疫/OEdata")
raw_pear_data<-readDNAStringSet("raw_pear.fa")
raw_pear_seq_name=names(raw_pear_data)
raw_pear_seq<-paste(raw_pear_data)
df_raw_pear_data<-data.frame(raw_pear_seq_name,raw_pear_seq,stringsAsFactors=F)
df_raw_pear_data$length=NA
df_raw_pear_data$length=nchar(raw_pear_seq)
df_raw_pear_data=df_raw_pear_data[order(df_raw_pear_data$length,decreasing=F),]
write.table(df_raw_pear_data,"raw_pear_length.txt",quote = F,sep="\t",row.names = F)
freq=table(df_raw_pear_data$length)
df_raw_pear_len_freq=data.frame(freq)
write.table(df_raw_pear_len_freq,"raw_pear_len_freq.txt",quote = F,sep="\t",row.names = F)