导读
- 查看文件、获取ID
- 读取数据表
- 统计“ftype”的各值频数
一、查看文件、获取ID
关键参数:
list.files(pattern="条件") # 根据条件获取文件名
strsplit(向量, split="部件") # 按“部件”切分每个向量值(用于去掉该部件)
as.character() # list返回值转成character
dir() # List the files in the current working directory
[1] "bin.25.tsv" "bin.33.tsv" "bin.36.tsv" "bin.37.tsv" "bin.38.tsv"
[6] "bin.39.tsv" "bin.43.tsv" "bin.46.tsv" "bin.50.tsv" "bin.56.tsv"
[11] "bin.58.tsv" "bin.61.tsv" "bin.63.tsv" "bin.65.tsv" "bin.66.tsv"
[16] "bin.67.tsv" "bin.71.tsv" "bin.81.tsv" "bin.89.tsv" "bin.91.tsv"
[21] "bin.94.tsv"
# Collect the names of the per-bin annotation tables in the working directory.
# `pattern` is a regular expression, not a shell glob, so the dots are escaped
# and the expression is anchored: only names of the form "bin.<anything>.tsv"
# are returned.
files <- list.files(pattern = "^bin\\..*\\.tsv$")
files
[1] "bin.25.tsv" "bin.33.tsv" "bin.36.tsv" "bin.37.tsv" "bin.38.tsv"
[6] "bin.39.tsv" "bin.43.tsv" "bin.46.tsv" "bin.50.tsv" "bin.56.tsv"
[11] "bin.58.tsv" "bin.61.tsv" "bin.63.tsv" "bin.65.tsv" "bin.66.tsv"
[16] "bin.67.tsv" "bin.71.tsv" "bin.81.tsv" "bin.89.tsv" "bin.91.tsv"
[21] "bin.94.tsv"
# Derive one bin ID per file name by removing the trailing ".tsv" extension.
# sub() is vectorised, so no loop is needed and an empty `files` vector simply
# yields character(0). The dot is escaped and the match is anchored at the end
# of the name, so only the real extension is stripped.
Bin_ID <- sub("\\.tsv$", "", files)
Bin_ID
[1] "bin.25" "bin.33" "bin.36" "bin.37" "bin.38" "bin.39" "bin.43" "bin.46"
[9] "bin.50" "bin.56" "bin.58" "bin.61" "bin.63" "bin.65" "bin.66" "bin.67"
[17] "bin.71" "bin.81" "bin.89" "bin.91" "bin.94"
二、读取数据表
关键参数:
read.table()使用多参数
# Read every annotation table into a list of data frames, one element per
# entry of `files` and in the same order. lapply() returns an empty list when
# there are no files instead of attempting to read a missing path.
ml <- lapply(files, function(path) {
  read.table(
    path,
    sep = "\t",
    na.strings = "",          # empty fields become NA
    stringsAsFactors = FALSE, # keep text columns as character
    header = TRUE,
    quote = "",               # treat quote characters as ordinary text
    comment.char = ""         # do not treat any character as a comment marker
  )
})
summary(ml) # Overview of the list: one row per data frame
Length Class Mode
[1,] 7 data.frame list
[2,] 7 data.frame list
[3,] 7 data.frame list
[4,] 7 data.frame list
[5,] 7 data.frame list
[6,] 7 data.frame list
[7,] 7 data.frame list
[8,] 7 data.frame list
[9,] 7 data.frame list
[10,] 7 data.frame list
[11,] 7 data.frame list
[12,] 7 data.frame list
[13,] 7 data.frame list
[14,] 7 data.frame list
[15,] 7 data.frame list
[16,] 7 data.frame list
[17,] 7 data.frame list
[18,] 7 data.frame list
[19,] 7 data.frame list
[20,] 7 data.frame list
[21,] 7 data.frame list
head(ml[[1]]) # Show the first rows of the first data frame in the list
locus_tag ftype length_bp gene EC_number COG
1 LBILEGMC_00001 CDS 324 <NA> <NA> <NA>
2 LBILEGMC_00002 CDS 2589 tmoS_1 2.7.13.3 <NA>
3 LBILEGMC_00003 CDS 852 <NA> <NA> <NA>
4 LBILEGMC_00004 CDS 1164 <NA> <NA> <NA>
5 LBILEGMC_00005 CDS 1356 <NA> <NA> <NA>
6 LBILEGMC_00006 CDS 975 <NA> 2.5.1.10 COG0142
product
1 hypothetical protein
2 Sensor histidine kinase TmoS
3 hypothetical protein
4 hypothetical protein
5 hypothetical protein
6 (2E,6E)-farnesyl diphosphate synthase
三、统计“ftype”的各值频数
# Count how often each feature type occurs in the "ftype" column of every
# data frame in `ml`. The four feature types are known in advance.
#
# count_ftype(value) returns an integer vector with one count per element of
# `ml`, in list order. The comparison is vectorised over the whole column, and
# na.rm = TRUE makes rows whose ftype is NA count as "no match" rather than
# stopping with an error; a data frame with zero rows yields a count of 0.
count_ftype <- function(value) {
  vapply(
    ml,
    function(df) sum(df[, "ftype"] == value, na.rm = TRUE),
    integer(1)
  )
}

CDS_num <- count_ftype("CDS")
rRNA_num <- count_ftype("rRNA")
tRNA_num <- count_ftype("tRNA")
tmRNA_num <- count_ftype("tmRNA")
# Combine the bin IDs and the per-bin feature counts into one summary table.
prokka_result <- data.frame(Bin_ID, CDS_num, tRNA_num, rRNA_num, tmRNA_num)
# Save the table as a tab-separated text file, without quoting and without
# row names. TRUE/FALSE are spelled out because T and F can be reassigned.
write.table(
  prokka_result,
  file = "prokka_result.txt",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE
)
查看结果文件: