【R】R代码记录

R包——Mutational Pattern画突变频谱图

效果图：
原文：https://www.jianshu.com/p/1347a0ee74a0

rm(list=ls())

source("http://bioconductor.org/biocLite.R")
biocLite("pheatmap")

library(pkgmaker)
library(registry)
library(rngtools)
library(cluster)
library(NMF)
library(maftools)
library(pheatmap)

setwd("./")

#input files
#frequently_mutected_genes=read.table("D:\\LQ\\items\\project\\tmp_xuhuawei\\frequently_mutated_genes.txt",header = T)
var.annovar.maf = annovarToMaf(annovar = "D:\\LQ\\items\\project\\tmp_xuhuawei\\all_annovar3", Center = 'NA', refBuild = 'hg19', tsbCol = 'Tumor_Sample_Barcode', table = 'ensGene',sep = "\t")
write.table(x=var.annovar.maf,file="var_annovar_maf",quote= F,sep="\t",row.names=F)
var_maf = read.maf(maf ="var_annovar_maf")

#find mutation genes in samples
#oncoplot(maf = var_maf, top =100, writeMatrix=T,removeNonMutated = F,showTumorSampleBarcodes=T)

#selected concordant genes and get maf information
#sample_genes=read.table("onco_matrix.txt",header = F,sep = "\t",skip = 1)
#sample_mutected_genes=subset(sample_genes,sample_genes$V1 %in% frequently_mutected_genes$Symbol)
#sample_mutected_maf=subset(var.annovar.maf,var.annovar.maf$Hugo_Symbol %in% sample_mutected_genes$V1)

#write.table(x=sample_mutected_maf,file="sample_mutected_maf",quote= F,sep="\t",row.names=F)

#sample.mutected.maf=read.maf(maf="sample_mutected_maf")


laml.tnm = trinucleotideMatrix(maf = var_maf, ref_genome = "ucsc.hg19.fasta", useSyn = TRUE,add = TRUE,prefix="chr")
laml.sign = extractSignatures(mat = laml.tnm, nTry = 6, plotBestFitRes = F)
plotSignatures(laml.sign)

pheatmap::pheatmap(mat = laml.sign$coSineSimMat, cluster_rows = FALSE, main = "cosine similarity against validated signatures")

数据提取和筛选1

这是做毕设时刚开始接触R，大部分都是我的老师写的，我也是看了这几个R代码才慢慢入了门的，去看别人的程序来学习是一种快捷方式，后来很多次我都还会反复来看看

rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)

#质谱检测肽段长度 8到25个氨基酸，比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25

# input files
in_phospho_file = "Phosphorylation_site_dataset" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data

# output files
out_phospho_antibody_human_p_file = "4.phospho_antibody_human_p.txt" # containing Human regulatoryhuman phospho sites 
out_fasta_reg_file = "4.df_fasta_phospho_antibody.txt" # human protein sequences including human regulatoryhuman phospho sites 
out_MS_data_reg_frequency_file = "4.MS_data_phospho_antibody_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites 
out_phospho_antibody_MS_comb_file = "4.phospho_antibody_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_antibody_MS_comb_MS_file = "4.phospho_antibody_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)

# Read regulatory_site file （读取调控数据）
setwd("D:/LQ/phosphosite_2017_12_20/")
phospho_data = read.table(in_phospho_file, header=T, sep="\t", skip=3, comment.char="", stringsAsFactors=F)    #读入磷酸化位点数据
phospho_antibody_human_p = subset(phospho_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD) & nchar(CST_CAT.)>2) # 找出人类的磷酸化修饰的位点

phospho_antibody_human_p$pos = as.numeric(substr(phospho_antibody_human_p$MOD_RSD, 2, nchar(phospho_antibody_human_p$MOD_RSD)-2)) # 修饰位点数字化定位
write.table(phospho_antibody_human_p, out_phospho_antibody_human_p_file, quote=FALSE, sep="\t", row.names=FALSE)    #读出phospho_antibody_human_p

#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file)            #读入蛋白质序列
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows

# retain protein sequences that contain antibody-targeted phosphosites
df_fasta_antibody = subset(df_fasta, PROTEIN_ID %in% phospho_antibody_human_p$ACC_ID)   #根据蛋白质ID，找到磷酸化修饰位点所在蛋白质序列  
#add 7x"_" before and after protein sequence for matching
df_fasta_antibody$SEQ = paste("_______", toupper(df_fasta_antibody$sequence), "_______",sep="")  #将序列改为大写，并加上7个"-",便于匹配
write.table(df_fasta_antibody, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE)      #读出带有抗体的蛋白质序列df_fasta_antibody

#debug code: check if all antibody-targeted sites are included in the protein sequences
phospho_antibody_human_p_tmp = subset(phospho_antibody_human_p, ACC_ID %in% df_fasta_antibody$PROTEIN_ID )
if(nrow(phospho_antibody_human_p_tmp) != nrow(phospho_antibody_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}

# read peptide sequences detected using MS machine 
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)   #读入质谱数据

# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_anti = subset(MS_data, PROTEIN_ID %in% df_fasta_antibody$PROTEIN_ID)    #根据蛋白质ID，筛选带有抗体的肽段
df_fasta_antibody = subset(df_fasta_antibody, PROTEIN_ID %in% MS_data$PROTEIN_ID)     #从带有抗体的蛋白质序列中选出能与肽段匹配的蛋白质序列

# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_anti_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F)   #根据肽段出现频率，读出肽段丰度
MS_data_anti_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE)   #读出肽段丰度

#找出蛋白质长度、带有抗体的磷酸化修饰的肽段序列，序列最大长度，及最大丰度
phospho_antibody_human_p$prot_length = NA  # store protein length
phospho_antibody_human_p$MS_seq = NA  # store the MS peptides that contain the antibody-targeted phosphosites
phospho_antibody_human_p$MS_seq_max = NA  # store the MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
phospho_antibody_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times

if( nrow(phospho_antibody_human_p) ==0 ){exit("Error! Line 71. There is no content in the df phospho_antibody_human_p!")}
#填入数据
for(i in 1:nrow(phospho_antibody_human_p)){
  phospho = phospho_antibody_human_p[i,]
  df_fasta = subset(df_fasta_antibody, PROTEIN_ID %in% phospho$ACC_ID)
  MS_data_tmp = subset(MS_data_anti, PROTEIN_ID %in% phospho$ACC_ID)  
  if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10 
  MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
  pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
  if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
    
    # Set the locations of MS peptides in protein sequence
    seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
    df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE) # change matrix format to df
    colnames(df_location_range) = c("pep_start", "pep_end")
    MS_data_freq = cbind(MS_data_freq, df_location_range)
    
    # Judge if the phosphosite is located within the MS peptides
    MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
    MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
    MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
    # order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
    MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]  

    phospho_antibody_human_p$prot_length[i] = nchar(df_fasta$sequence)
    phospho_antibody_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
    phospho_antibody_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
    phospho_antibody_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
    phospho_antibody_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
  }
}

write.table(phospho_antibody_human_p, out_phospho_antibody_MS_comb_file, quote=FALSE, sep="\t", row.names=FALSE, na="")

# only select the regulatory phosphosites that are located in the MS peptides in the range of 8 to 25. 
phospho_antibody_human_p_MS = subset(phospho_antibody_human_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(phospho_antibody_human_p_MS, out_phospho_antibody_MS_comb_MS_file, quote=FALSE, sep="\t", row.names=FALSE, na="")

数据提取和筛选2

rm(list=ls())
library(hash)
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)

MS_seq_length_min = 6
MS_seq_length_max = 20

# Read regulatory_site and disease_site files
setwd("G:/projects/phosphosite_2017_12_20/")
regulatory_file = "Regulatory_sites.xls"
regulatory_data = read.xlsx(regulatory_file, sheetIndex=1, header=TRUE, stringsAsFactors=F)
regulatory_human_p = subset(regulatory_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD))
regulatory_human_p$upper_peptide = toupper(regulatory_human_p$SITE_...7_AA)
#regulatory_human_p_kinase = subset(regulatory_human_p, grepl("kinase", PROT_TYPE, ignore.case=T))
#regulatory_human_p_enzyme = subset(regulatory_human_p, grepl("EC", PROT_TYPE))

disease_data = read.xlsx(regulatory_file, sheetIndex=2, header=TRUE, stringsAsFactors=F)
disease_data$upper_peptide = toupper(disease_data$SITE_...7_AA)
disease_human_p = subset(disease_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD))
disease_human_p_changed = subset(disease_data, ORGANISM %in% "human" & grepl("p", MOD_RSD) & ALTERATION %in% c("increased", "decreased"))

columnames = c("GENE", "PROTEIN", "ACC_ID", "GENE_ID", "ORGANISM", "MOD_RSD", "SITE_...7_AA","LT_LIT", "MS_LIT", "MS_CST", "upper_peptide")
regulatory_human_p = regulatory_human_p[,columnames] 
disease_human_p_changed = disease_human_p_changed[,columnames]

dis_reg_hu_p = rbind(regulatory_human_p, disease_human_p_changed)
dis_reg_hu_p = unique(dis_reg_hu_p)
dis_reg_hu_p$pos = as.numeric(substr(dis_reg_hu_p$MOD_RSD, 2, nchar(dis_reg_hu_p$MOD_RSD)-2))
write.table(dis_reg_hu_p, "1.dis_reg_hu_p.txt", quote=FALSE, sep="\t", row.names=FALSE)

# fasta file #
fastaFile <- readAAStringSet("Phosphosite_PTM_seq.fasta")
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor df to character df
df_fasta = unique(df_fasta)

df_fasta_dis_reg = subset(df_fasta, PROTEIN_ID %in% dis_reg_hu_p$ACC_ID)
df_fasta_dis_reg$SEQ = paste("_______", toupper(df_fasta_dis_reg$sequence), "_______",sep="")
write.table(df_fasta_dis_reg, "1.df_fasta_dis_reg.txt", quote=FALSE, sep="\t", row.names=FALSE)

# track_back check
dis_reg_hu_p_tmp = subset(dis_reg_hu_p, ACC_ID %in% df_fasta_dis_reg$PROTEIN_ID )
if(nrow(dis_reg_hu_p_tmp) != nrow(dis_reg_hu_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}

# read MS peptides #############
MS_data = read.table("huangyu_proteomicsDB_2.txt", header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)

#keep both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_reg_dis = subset(MS_data, PROTEIN_ID %in% df_fasta_dis_reg$PROTEIN_ID)
df_fasta_dis_reg = subset(df_fasta_dis_reg, PROTEIN_ID %in% MS_data$PROTEIN_ID)

MS_data_reg_dis_frequency=as.data.frame(table(MS_data_reg_dis$SEQUENCE), stringsAsFactors=F)
MS_data_reg_dis_frequency = subset(MS_data_reg_dis_frequency, Freq>0)
write.table(MS_data_reg_dis_frequency, "1.MS_data_reg_dis_frequency.txt", quote=FALSE, sep="\t", row.names=FALSE)

dis_reg_hu_p$prot_length = NA
dis_reg_hu_p$MS_seq = NA
dis_reg_hu_p$MS_seq_max = NA
dis_reg_hu_p$MS_seq_max_length = NA
dis_reg_hu_p$MS_seq_max_freq = NA
for(i in 1:nrow(dis_reg_hu_p)){
  phospho = dis_reg_hu_p[i,]
  df_fasta = subset(df_fasta_dis_reg, PROTEIN_ID %in% phospho$ACC_ID)
  MS_data_tmp = subset(MS_data_reg_dis, PROTEIN_ID %in% phospho$ACC_ID)
  if(length(MS_data_tmp$SEQUENCE)<10){next} # next if protein seq length <10 
  MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
  pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
  if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
    
    # Set the locations of MS peptides in protein sequence
    seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
    df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE)
    colnames(df_location_range) = c("pep_start", "pep_end")
    MS_data_freq = cbind(MS_data_freq, df_location_range)
    
    # Judge if the phosphosite is located within the MS peptides
    MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
    MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
    MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
    # order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
    MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]  

    dis_reg_hu_p$prot_length[i] = nchar(df_fasta$sequence)
    dis_reg_hu_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
    dis_reg_hu_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
    dis_reg_hu_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
    dis_reg_hu_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
  }
}

write.table(dis_reg_hu_p, "1.phospho_reg_dis_MS_comb.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")

dis_reg_hu_p_select = subset(dis_reg_hu_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )

rm(list=ls())
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)

#质谱检测肽段长度 8到25个氨基酸，比较合适。
MS_seq_length_min = 8
MS_seq_length_max = 25

# input files
in_regulatory_file = "Regulatory_sites.xls" #调控数据
in_MS_file = "huangyu_proteomicsDB_2.txt" # MS 质谱数据
in_protein_fasta_file = "Phosphosite_PTM_seq.fasta" # protein sequence data

# output files
out_regulatory_human_p_file = "2.regulatory_human_p.txt" # containing Human regulatoryhuman phospho sites 
out_fasta_reg_file = "2.df_fasta_reg.txt" # human protein sequences including human regulatoryhuman phospho sites 
out_MS_data_reg_frequency_file = "2.MS_data_reg_frequency.txt" # frequency of MS peptides that contain human regulatoryhuman phospho sites 
out_phospho_reg_MS_comb_file = "2.phospho_reg_MS_comb.txt" # combination of regulatory phosphosite and MS peptides
out_phospho_reg_MS_comb_MS_file = "2.phospho_reg_MS_comb_seq_MS.txt" # # combination of regulatory phosphosite and MS peptides (length: 8-25)


# Read regulatory_site file （读取调控数据）
setwd("G:/projects/phosphosite_2017_12_20/")
regulatory_data = read.xlsx(in_regulatory_file, sheetIndex=1, header=TRUE, stringsAsFactors=F)
regulatory_human_p = subset(regulatory_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD)) # human and phospho modificatoin only

regulatory_human_p$pos = as.numeric(substr(regulatory_human_p$MOD_RSD, 2, nchar(regulatory_human_p$MOD_RSD)-2)) # retrive modification positions
write.table(regulatory_human_p, out_regulatory_human_p_file, quote=FALSE, sep="\t", row.names=FALSE)

#read human protein sequence file (fasta format)
fastaFile <- readAAStringSet(in_protein_fasta_file)
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor class to character class
df_fasta = unique(df_fasta) # remove replicate rows

# retain protein sequences that contain regulatory phosphosites
df_fasta_reg = subset(df_fasta, PROTEIN_ID %in% regulatory_human_p$ACC_ID)
#add 7x"_" before and after protein sequence for matching
df_fasta_reg$SEQ = paste("_______", toupper(df_fasta_reg$sequence), "_______",sep="")
write.table(df_fasta_reg, out_fasta_reg_file, quote=FALSE, sep="\t", row.names=FALSE)

#debug code: check if all regulatory sites are included in the protein sequences
regulatory_human_p_tmp = subset(regulatory_human_p, ACC_ID %in% df_fasta_reg$PROTEIN_ID )
if(nrow(regulatory_human_p_tmp) != nrow(regulatory_human_p)){stop("Error! the number of rows are different from Phosphosite ! track back!")}

# read peptide sequences detected using MS machine #############
MS_data = read.table(in_MS_file, header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)

# Let both MS_data and phospho_dis_reg have the same swisprot ID
MS_data_reg = subset(MS_data, PROTEIN_ID %in% df_fasta_reg$PROTEIN_ID)
df_fasta_reg = subset(df_fasta_reg, PROTEIN_ID %in% MS_data$PROTEIN_ID)

# count the frequency of each MS peptide (#the more frequency, the easier to be detected)
MS_data_reg_frequency=as.data.frame(table(MS_data_reg$SEQUENCE), stringsAsFactors=F)
MS_data_reg_frequency = subset(MS_data_reg_frequency, Freq>0)
write.table(MS_data_reg_frequency, out_MS_data_reg_frequency_file, quote=FALSE, sep="\t", row.names=FALSE)

regulatory_human_p$prot_length = NA  # store protein length
regulatory_human_p$MS_seq = NA  # store the MS peptides that contain the regulatoary phosphosites
regulatory_human_p$MS_seq_max = NA  # store the MS peptide detected with the most times
regulatory_human_p$MS_seq_max_length = NA # store the length of MS peptide detected with the most times
regulatory_human_p$MS_seq_max_freq = NA # store the frequency of the MS peptide with the most times
for(i in 1:nrow(regulatory_human_p)){
  phospho = regulatory_human_p[i,]
  df_fasta = subset(df_fasta_reg, PROTEIN_ID %in% phospho$ACC_ID)
  MS_data_tmp = subset(MS_data_reg, PROTEIN_ID %in% phospho$ACC_ID)
  if(length(MS_data_tmp$SEQUENCE)<10){next} # omit the proteins if protein sequence length <10 
  MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
  pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence
  if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
    
    # Set the locations of MS peptides in protein sequence
    seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1) # return start and end position for each MS peptide
    df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE)
    colnames(df_location_range) = c("pep_start", "pep_end")
    MS_data_freq = cbind(MS_data_freq, df_location_range)
    
    # Judge if the phosphosite is located within the MS peptides
    MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start
    MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
    MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
    # order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
    MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]  

    regulatory_human_p$prot_length[i] = nchar(df_fasta$sequence)
    regulatory_human_p$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
    regulatory_human_p$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
    regulatory_human_p$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
    regulatory_human_p$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
  }
}

write.table(regulatory_human_p, out_phospho_reg_MS_comb_file, quote=FALSE, sep="\t", row.names=FALSE, na="")

# only select the regulatory phosphosites that are located in the MS peptides in the range of 8 to 25. 
regulatory_human_p_MS = subset(regulatory_human_p, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(regulatory_human_p_MS, out_phospho_reg_MS_comb_MS_file, quote=FALSE, sep="\t", row.names=FALSE, na="")

rm(list=ls())
library(hash)
library(stringr)
library(plyr)
library(xlsx)
library(Biostrings)
library(splitstackshape)

MS_seq_length_min = 8
MS_seq_length_max = 25

# Read disease_site file
setwd("D:/LQ/phosphosite_2017_12_20/")
regulatory_file = "Regulatory_sites.xls"
disease_data = read.xlsx(regulatory_file, sheetIndex=2, header=TRUE, stringsAsFactors=F) #读入disease数据
disease_data$upper_peptide = toupper(disease_data$SITE_...7_AA)  #将SITE全部变为大写
disease_human_p = subset(disease_data, ORGANISM %in% "human" & grepl("p$", MOD_RSD))  #从disesase中筛选出human和磷酸化的

# only select phosphosites whose expression are changed in diseases
disease_human_p_changed = subset(disease_data, ORGANISM %in% "human" & grepl("p", MOD_RSD) & ALTERATION %in% c("increased", "decreased"))#选出磷酸化表达会改变的
disease_human_p_changed = unique(disease_human_p_changed)  #合并相同行
disease_human_p_changed$pos = as.numeric(substr(disease_human_p_changed$MOD_RSD, 2, nchar(disease_human_p_changed$MOD_RSD)-2))   #定位蛋白质位点数字化
write.table(disease_human_p_changed, "3.disease_human_p_changed.txt", quote=FALSE, sep="\t", row.names=FALSE)    #读出文件

# fasta file #
fastaFile <- readAAStringSet("Phosphosite_PTM_seq.fasta")
seq_name = names(fastaFile)
sequence = paste(fastaFile)
df_fasta <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_fasta = cSplit(df_fasta,"seq_name","|",direction="wide")
colnames(df_fasta)[ncol(df_fasta)] = "PROTEIN_ID"
df_fasta[] <- lapply(df_fasta, as.character) # change factor df to character df
df_fasta = unique(df_fasta)

df_fasta_dis = subset(df_fasta, PROTEIN_ID %in% disease_human_p_changed$ACC_ID)       #根据蛋白质ID，从fasta里选出与disease相同的数据
df_fasta_dis$SEQ = paste("_______", toupper(df_fasta_dis$sequence), "_______",sep="")  #添加SEQ列，将sequence变为大写并在前面加7“-”
write.table(df_fasta_dis, "3.df_fasta_dis.txt", quote=FALSE, sep="\t", row.names=FALSE)   #读出文件

# track_back check
disease_human_p_changed_tmp = subset(disease_human_p_changed, ACC_ID %in% df_fasta_dis$PROTEIN_ID )
if(nrow(disease_human_p_changed_tmp) != nrow(disease_human_p_changed)){stop("Error! the number of rows are different from Phosphosite ! track back!")}

#read MS peptides 
MS_data = read.table("huangyu_proteomicsDB_2.txt", header=T, sep="\t",na.strings = "NA", comment.char="", stringsAsFactors=F)

#keep both MS_data and phospho_dis have the same swisprot ID
MS_data_dis = subset(MS_data, PROTEIN_ID %in% df_fasta_dis$PROTEIN_ID)   #根据蛋白质id,从MS数据中筛选出与df_fasta_dis相同的数据
df_fasta_dis = subset(df_fasta_dis, PROTEIN_ID %in% MS_data$PROTEIN_ID)  #根据蛋白质ID，从df_fasta_dis中筛选出MS数据中筛选出相同的数据

MS_data_dis_frequency=as.data.frame(table(MS_data_dis$SEQUENCE), stringsAsFactors=F)    #根据序列得到肽段丰度
MS_data_dis_frequency = subset(MS_data_dis_frequency, Freq>0)
write.table(MS_data_dis_frequency, "3.MS_data_dis_frequency.txt", quote=FALSE, sep="\t", row.names=FALSE)    #读出数据MS_data_dis_frequency

disease_human_p_changed$prot_length = NA
disease_human_p_changed$MS_seq = NA
disease_human_p_changed$MS_seq_max = NA
disease_human_p_changed$MS_seq_max_length = NA      
disease_human_p_changed$MS_seq_max_freq = NA
for(i in 1:nrow(disease_human_p_changed)){
  phospho = disease_human_p_changed[i,]
  df_fasta = subset(df_fasta_dis, PROTEIN_ID %in% phospho$ACC_ID)
  MS_data_tmp = subset(MS_data_dis, PROTEIN_ID %in% phospho$ACC_ID)
  if(length(MS_data_tmp$SEQUENCE)<10){next} # next if protein seq length <10 
  MS_data_freq=as.data.frame(table(MS_data_tmp$SEQUENCE), stringsAsFactors=F) # summerize the MS peptides and frequency
  pos = unlist(gregexpr(pattern =phospho$SITE_...7_AA,df_fasta$SEQ, ignore.case = T)) # position of phosphosite in protein sequence# if( phospho$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
  if( disease_human_p_changed$pos %in% pos){ # if both positions are same (position stored in phosphosite and position by matching with protein sequence)
  
  
    # Set the locations of MS peptides in protein sequence      
    # return start and end position for each MS peptide
    seq_locations=str_locate(toupper(df_fasta$sequence), MS_data_freq$Var1)      #将da_fasta中seq变为大写，然后与质谱中的肽段匹配
    df_location_range=data.frame(matrix(seq_locations, ncol=2, byrow=F),stringsAsFactors=FALSE)  #等到定位后，另取两列，命名为“pep_start","pep_end"
    colnames(df_location_range) = c("pep_start", "pep_end")
    MS_data_freq = cbind(MS_data_freq, df_location_range)     #将MS_data_freq与df_location_range列合并
    
    # Judge if the phosphosite is located within the MS peptides  
    MS_data_freq$pos_minus_start = phospho$pos - MS_data_freq$pep_start   
    MS_data_freq$pos_minus_end = phospho$pos - MS_data_freq$pep_end
    MS_data_freq_inc_phospho = subset(MS_data_freq, pos_minus_start>0 & pos_minus_end<0)
    # order the df if df has multiple MS peptides containing phosphosite, the one with max freq is ordered at the top
    MS_data_freq_inc_phospho = MS_data_freq_inc_phospho[with(MS_data_freq_inc_phospho, order(-Freq)), ]  #选取丰度最高的肽段

    disease_human_p_changed$prot_length[i] = nchar(df_fasta$sequence)
    disease_human_p_changed$MS_seq[i] = paste(MS_data_freq_inc_phospho$Var1, MS_data_freq_inc_phospho$Freq, collapse=";")
    disease_human_p_changed$MS_seq_max[i] = MS_data_freq_inc_phospho$Var1[1]
    disease_human_p_changed$MS_seq_max_length[i] = nchar(MS_data_freq_inc_phospho$Var1[1])
    disease_human_p_changed$MS_seq_max_freq[i] = MS_data_freq_inc_phospho$Freq[1]
  }
}

write.table(disease_human_p_changed, "3.phospho_dis_MS_comb.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")

disease_human_p_changed_select = subset(disease_human_p_changed, nchar(MS_seq_max)>=MS_seq_length_min & nchar(MS_seq_max)<= MS_seq_length_max )
write.table(disease_human_p_changed_select, "3.phospho_dis_MS_comb_seq_select.txt", quote=FALSE, sep="\t", row.names=FALSE, na="")

rm(list=ls())
library(plyr)

#设置工作目录
setwd("D:/LQ/phosphosite_2017_12_20")

#输入文件
input_fasta_reg="2.df_fasta_reg.txt"    
input_egfr_protein_names="egfr_protein_names.txt"   

#输出文件
outfile_not_in_fasta_protein="not_in_fasta_protein.txt"
outfile_fasta_egfr_protein_names_id="fasta_egfr_protein_names_id.txt"

#读入文件fasta_reg和egfr pathway proteins
fasta_reg= read.table(input_fasta_reg, header=T,sep="\t",stringsAsFactors=F, na.strings = "") 
egfr_protein_names= read.table(input_egfr_protein_names, header=T, sep="\t") 


#拆开蛋白质名字中的"GN:"
vs.test= unlist(strsplit(fasta_reg$seq_name_1, split=":"))
m.test = matrix(vs.test, ncol=2, byrow=T)
#给m.test列命名
colnames(m.test)[1:2]<-c("GN","gene_names")
df.convert = as.data.frame(m.test, stringsAsFactors=F)


#将基因和蛋白质id对应
fasta_gene_protein_id<-data.frame(m.test,fasta_reg$PROTEIN_ID)
#删除第一列“GN”
fasta_gene_protein_id = fasta_gene_protein_id[,-1]

#将egfr中蛋白质名称变为大写
egfr_protein_names<-toupper(egfr_protein_names$protein_names)


#将fasta_protein中的蛋白质与egfr_protein_names比对
fasta_egfr_protein = subset(egfr_protein_names, egfr_protein_names %in% fasta_gene_protein_id$gene_names) 
fasta_egfr_protein=data.frame(fasta_egfr_protein)


#找出不在fasta文件中的蛋白质
not_in_fasta_protein = subset(egfr_protein_names, ! egfr_protein_names %in% fasta_gene_protein_id$gene_names) 
not_in_fasta_protein=data.frame(not_in_fasta_protein)
write.table(not_in_fasta_protein,outfile_not_in_fasta_protein,quote=FALSE, sep="\t", row.names=FALSE, na="")


#找出fasta和egfr共有的蛋白质以及蛋白质id
fasta_egfr_protein_names_id=subset(fasta_gene_protein_id,fasta_gene_protein_id$gene_names%in%fasta_egfr_protein$fasta_egfr_protein)
write.table(fasta_egfr_protein_names_id,outfile_fasta_egfr_protein_names_id,quote=FALSE, sep="\t", row.names=FALSE, na="")

rm(list=ls())

#加载需要的包
library(Biostrings)
library(ShortRead)

setwd("D:/LQ/免疫/OEdata")

#读入PEAR DATA_1,将fastq文件转换为fasta文件
raw_pear<-readFastq("D:/LQ/免疫/raw_pear.fastq")
writeFasta(raw_pear,"raw_pear.fa")


#读入Pear拼接数据的fasta文件
raw_pear_data<-readDNAStringSet("raw_pear.fa")
raw_pear_seq_name=names(raw_pear_data)
raw_pear_seq<-paste(raw_pear_data)
df_raw_pear_data<-data.frame(raw_pear_seq_name,raw_pear_seq,stringsAsFactors=F)


#读入OD DATA
oe_fasta_1<-readDNAStringSet("ATCACG.fa")
oe_seq_name = names(oe_fasta_1)
OE_seq = paste(oe_fasta_1)
df_oe_fasta_1<- data.frame(oe_seq_name, OE_seq,stringsAsFactors=F)


#合并数据
total=merge(df_raw_pear_data,df_oe_fasta_1,by.x="raw_pear_seq_name", by.y="oe_seq_name", all=T)
total_woNA= na.omit(total)

#找出oe和raw的相同与不同序列
total_woNA$seq_identity=(total_woNA$raw_pear_seq == total_woNA$OE_seq)
total_woNA_raw_pear_in_oe = subset(total_woNA, seq_identity)
total_woNA_raw_pear_not_in_oe= subset(total_woNA, ! seq_identity)
a=total_woNA[,2:3]

f <- function(a) {
  grepl(a[2],a[1], fixed=T)
}
total_woNA$seq_included=apply(total_woNA[,2:3], 1, f)
total_woNA_seq_included=subset(total_woNA,seq_included)
total_woNA_seq_not_include=subset(total_woNA,!seq_included)

#消除相同行
total_woNA_seq_included=unique(total_woNA_seq_included)
total_woNA_seq_not_include=unique(df_total_woNA_seq_not_include)

#找出oe不包含raw的序列长度
oe_length_not_included=nchar(total_woNA_seq_not_include$OE_seq)
raw_length_not_included=nchar(total_woNA_seq_not_include$raw_pear_seq)
df_length_not_include=data.frame(oe_length_not_included,raw_length_not_included)

#输出文件
write.table(total_woNA,"D:/LQ/免疫/raw_oe_total.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_raw_pear_in_oe,"D:/LQ/免疫/raw_pear_in_oe.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_raw_pear_not_in_oe,"D:/LQ/免疫/total_woNA_raw_pear_not_in_oe.txt",quote = F,sep="\t",row.names = F)
write.table(total_woNA_seq_included,"D:/LQ/免疫/oe_seq_included_raw_pear.txt",quote = F,sep="\t",row.names=F)
write.table(total_woNA_seq_not_include,"D:/LQ/免疫/oe_seq_not_included_raw_pear.txt",quote=F,sep="\t",row.names = F)

rm(list=ls())

library(Biostrings)

setwd("D:/LQ/免疫/OEdata")

#读入文件，fasta文件处理
od_data_1<-readDNAStringSet("ATCACG.fa")
seq_name = names(od_data_1)
sequence = paste(od_data_1)
df_od_data_1 <- data.frame(seq_name, sequence,stringsAsFactors=F)
df_od_data_1=unique(df_od_data_1)#消除重复行

#求序列长度
df_od_data_1$length=NA
sequence<-as.vector(df_od_data_1$sequence)
df_od_data_1$length=nchar(sequence)
df_od_data_1=df_od_data_1[order(df_od_data_1$length,decreasing = F),]#按序列长度大小排序，从小到大
write.table(df_od_data_1,"od_data_len.xls",quote = F,sep="\t",row.names = F)

#统计长度出现的频率
freq=table(df_od_data_1$length)        
df_oe_data_len_freq=data.frame(freq)
write.table(df_oe_data_len_freq,"oe_data_len_freq.txt",quote = F,sep="\t",row.names = F)

rm(list=ls())

library(Biostrings)

setwd("D:/LQ/免疫/OEdata")

raw_pear_data<-readDNAStringSet("raw_pear.fa")
raw_pear_seq_name=names(raw_pear_data)
raw_pear_seq<-paste(raw_pear_data)
df_raw_pear_data<-data.frame(raw_pear_seq_name,raw_pear_seq,stringsAsFactors=F)

df_raw_pear_data$length=NA
df_raw_pear_data$length=nchar(raw_pear_seq)
df_raw_pear_data=df_raw_pear_data[order(df_raw_pear_data$length,decreasing=F),]
write.table(df_raw_pear_data,"raw_pear_length.txt",quote = F,sep="\t",row.names = F)

freq=table(df_raw_pear_data$length)
df_raw_pear_len_freq=data.frame(freq)
write.table(df_raw_pear_len_freq,"raw_pear_len_freq.txt",quote = F,sep="\t",row.names = F)

最后编辑于：2018.12.17 17:38:27

人面猴
序言：七十年代末，一起剥皮案震惊了整个滨河市，随后出现的几起案子，更是在滨河造成了极大的恐慌，老刑警刘岩，带你破解...
沈念sama阅读 224,176评论 6赞 522
死咒
序言：滨河连续发生了三起死亡事件，死亡现场离奇诡异，居然都是意外死亡，警方通过查阅死者的电脑和手机，发现死者居然都...
沈念sama阅读 95,928评论 3赞 402
救了他两次的神仙让他今天三更去死
文/潘晓璐我一进店门，熙熙楼的掌柜王于贵愁眉苦脸地迎上来，“玉大人，你说我怎么就摊上这事。” “怎么了？”我有些...
开封第一讲书人阅读 171,252评论 0赞 366
道士缉凶录：失踪的卖姜人
文/不坏的土叔我叫张陵，是天一观的道长。经常有香客问我，道长，这世上最难降的妖魔是什么？我笑而不...
开封第一讲书人阅读 60,700评论 1赞 300
港岛之恋（遗憾婚礼）
正文为了忘掉前任，我火速办了婚礼，结果婚礼上，老公的妹妹穿的比我还像新娘。我一直安慰自己，他们只是感情好，可当我...
茶点故事阅读 69,717评论 6赞 399
恶毒庶女顶嫁案：这布局不是一般人想出来的
文/花漫我一把揭开白布。她就那样静静地躺着，像睡着了一般。火红的嫁衣衬着肌肤如雪。梳的纹丝不乱的头发上，一...
开封第一讲书人阅读 53,231评论 1赞 314
城市分裂传说
那天，我揣着相机与录音，去河边找鬼。笑死，一个胖子当着我的面吹牛，可吹牛的内容都是我干的。我是一名探鬼主播，决...
沈念sama阅读 41,608评论 3赞 428
双鸳鸯连环套：你想象不到人心有多黑
文/苍兰香墨我猛地睁开眼，长吁一口气：“原来是场噩梦啊……” “哼！你这毒妇竟也来了？” 一声冷哼从身侧响起，我...
开封第一讲书人阅读 40,572评论 0赞 279
万荣杀人案实录
序言：老挝万荣一对情侣失踪，失踪者是张志新（化名）和其女友刘颖，没想到半个月后，有当地人在树林里发现了一具尸体，经...
沈念sama阅读 47,117评论 1赞 324
护林员之死
正文独居荒郊野岭守林人离奇死亡，尸身上长有42处带血的脓包…… 初始之章·张勋以下内容为张勋视角年9月15日...
茶点故事阅读 39,137评论 3赞 344
白月光启示录
正文我和宋清朗相恋三年，在试婚纱的时候发现自己被绿了。大学时的朋友给我发了我未婚夫和他白月光在一起吃饭的照片。...
茶点故事阅读 41,280评论 1赞 354
活死人
序言：一个原本活蹦乱跳的男人离奇死亡，死状恐怖，灵堂内的尸体忽然破棺而出，到底是诈尸还是另有隐情，我是刑警宁泽，带...
沈念sama阅读 36,908评论 5赞 350
日本核电站爆炸内幕
正文年R本政府宣布，位于F岛的核电站，受9级特大地震影响，放射性物质发生泄漏。R本人自食恶果不足惜，却给世界环境...
茶点故事阅读 42,597评论 3赞 336
男人毒药：我在死后第九天来索命
文/蒙蒙一、第九天我趴在偏房一处隐蔽的房顶上张望。院中可真热闹，春花似锦、人声如沸。这庄子的主人今日做“春日...
开封第一讲书人阅读 33,067评论 0赞 25
一桩弑父案，背后竟有这般阴谋
文/苍兰香墨我抬头看了看天上的太阳。三九已至，却和暖如春，着一层夹袄步出监牢的瞬间，已是汗流浃背。一阵脚步声响...
开封第一讲书人阅读 34,202评论 1赞 275
情欲美人皮
我被黑心中介骗来泰国打工，没想到刚下飞机就差点儿被人妖公主榨干…… 1. 我叫王不留，地道东北人。一个月前我还...
沈念sama阅读 49,784评论 3赞 380
代替公主和亲
正文我出身青楼，却偏偏与公主长得像，于是被迫代替她去往敌国和亲。传闻我的和亲对象是个残疾皇子，可洞房花烛夜当晚...
茶点故事阅读 46,308评论 2赞 365

【R】R代码记录

R包——Mutational Pattern画突变频谱图

数据提取和筛选1

数据提取和筛选2

推荐阅读更多精彩内容