1 一共八个物种
拟南芥:
# Download the whole TAIR10 assembly directory. The URL names a directory, so
# recursive retrieval is required: -np keeps wget below this directory and -nd
# saves the files into the current directory instead of recreating the path.
wget -c -r -np -nd ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/plant/Arabidopsis_thaliana/all_assembly_versions/GCF_000001735.3_TAIR10/
# NOTE(review): NCBI normally ships these files gzip-compressed (*.gz) - confirm,
# and decompress the *_rna.fna file before running makeblastdb.
(其他7个物种已下载)
2 将各个物种的refseq单独与基因进行比对
过程与之前类似:
-
建库
# Build a nucleotide BLAST database from the TAIR10 RefSeq RNA FASTA file.
makeblastdb \
  -in GCF_000001735.3_TAIR10_rna.fna \
  -input_type fasta \
  -dbtype nucl \
  -parse_seqids \
  -title Arabidopsis_thaliana_rna
-
比对
# Align the query gene sequences against the TAIR10 RNA database.
# Hits are written to db_blast_TAIR10 in tabular format (-outfmt 6), keeping
# only those with an E-value of at most 1e-10; the search uses 4 threads.
blastn \
  -db GCF_000001735.3_TAIR10_rna.fna \
  -query AER314_4_gene.fa \
  -evalue 1e-10 \
  -outfmt 6 \
  -num_threads 4 \
  -out db_blast_TAIR10
3 使用R进行merge
# Annotate the BLAST hits with gene information by joining the two tables on
# the RefSeq RNA accession column.
# NOTE(review): blastn -outfmt 6 writes headerless tab-separated text, so this
# presumably relies on a separate step that converted the hits to a CSV with a
# header row containing the key column - confirm.
key_col <- "RNA_nucleotide_accession.version"

DF1 <- read.csv("lyr/rna-seq/blast/geneID/TAIR10.csv") # gene ID table
DF2 <- read.csv("lyr/rna-seq/blast/data/db_blast_TAIR10.csv") # BLAST hit table

# Check the dimensions of both tables.
dim(DF1)
dim(DF2)

# Fail early with a clear message if either table lacks the join key.
stopifnot(key_col %in% names(DF1), key_col %in% names(DF2))

# Merge once and keep the result; all.y = TRUE retains every row of DF2 even
# when DF1 has no matching accession (the DF1 columns are then NA).
data1 <- merge(DF1, DF2, by = key_col, all.y = TRUE)
head(data1) # preview the first rows instead of printing the whole table

# Write the merged table to result_TAIR10.csv.
# NOTE(review): quote = FALSE produces a malformed CSV if any field contains a
# comma - confirm that none do, or drop the argument.
write.csv(data1, file = "result_TAIR10.csv", quote = FALSE, row.names = FALSE)